arch/powerpc/mm/hugetlbpage.c

   1 /*
   2  * PPC64 (POWER4) Huge TLB Page Support for Kernel.
   3  *
   4  * Copyright (C) 2003 David Gibson, IBM Corporation.
   5  *
   6  * Based on the IA-32 version:
   7  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
   8  */
   9
  10 #include <linux/init.h>
  11 #include <linux/fs.h>
  12 #include <linux/mm.h>
  13 #include <linux/hugetlb.h>
  14 #include <linux/pagemap.h>
  15 #include <linux/slab.h>
  16 #include <linux/err.h>
  17 #include <linux/sysctl.h>
  18 #include <asm/mman.h>
  19 #include <asm/pgalloc.h>
  20 #include <asm/tlb.h>
  21 #include <asm/tlbflush.h>
  22 #include <asm/mmu_context.h>
  23 #include <asm/machdep.h>
  24 #include <asm/cputable.h>
  25 #include <asm/spu.h>
  26
  27 #define NUM_LOW_AREAS   (0x100000000UL >> SID_SHIFT)
  28 #define NUM_HIGH_AREAS  (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
  29
  30 #ifdef CONFIG_PPC_64K_PAGES
  31 #define HUGEPTE_INDEX_SIZE      (PMD_SHIFT-HPAGE_SHIFT)
  32 #else
  33 #define HUGEPTE_INDEX_SIZE      (PUD_SHIFT-HPAGE_SHIFT)
  34 #endif
  35 #define PTRS_PER_HUGEPTE        (1 << HUGEPTE_INDEX_SIZE)
  36 #define HUGEPTE_TABLE_SIZE      (sizeof(pte_t) << HUGEPTE_INDEX_SIZE)
  37
  38 #define HUGEPD_SHIFT            (HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
  39 #define HUGEPD_SIZE             (1UL << HUGEPD_SHIFT)
  40 #define HUGEPD_MASK             (~(HUGEPD_SIZE-1))
  41
  42 #define huge_pgtable_cache      (pgtable_cache[HUGEPTE_CACHE_NUM])
  43
  44 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  45  * will choke on pointers to hugepte tables, which is handy for
  46  * catching screwups early. */
  47 #define HUGEPD_OK       0x1
  48
  49 typedef struct { unsigned long pd; } hugepd_t;
  50
  51 #define hugepd_none(hpd)        ((hpd).pd == 0)
  52
  53 static inline pte_t *hugepd_page(hugepd_t hpd)
  54 {
  55         BUG_ON(!(hpd.pd & HUGEPD_OK));
  56         return (pte_t *)(hpd.pd & ~HUGEPD_OK);
  57 }
  58
  59 static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
  60 {
  61         unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
  62         pte_t *dir = hugepd_page(*hpdp);
  63
  64         return dir + idx;
  65 }
  66
  67 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
  68                            unsigned long address)
  69 {
  70         pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
  71                                       GFP_KERNEL|__GFP_REPEAT);
  72
  73         if (! new)
  74                 return -ENOMEM;
  75
  76         spin_lock(&mm->page_table_lock);
  77         if (!hugepd_none(*hpdp))
  78                 kmem_cache_free(huge_pgtable_cache, new);
  79         else
  80                 hpdp->pd = (unsigned long)new | HUGEPD_OK;
  81         spin_unlock(&mm->page_table_lock);
  82         return 0;
  83 }
  84
  85 /* Modelled after find_linux_pte() */
  86 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
  87 {
  88         pgd_t *pg;
  89         pud_t *pu;
  90
  91         BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
  92
  93         addr &= HPAGE_MASK;
  94
  95         pg = pgd_offset(mm, addr);
  96         if (!pgd_none(*pg)) {
  97                 pu = pud_offset(pg, addr);
  98                 if (!pud_none(*pu)) {
  99 #ifdef CONFIG_PPC_64K_PAGES
 100                         pmd_t *pm;
 101                         pm = pmd_offset(pu, addr);
 102                         if (!pmd_none(*pm))
 103                                 return hugepte_offset((hugepd_t *)pm, addr);
 104 #else
 105                         return hugepte_offset((hugepd_t *)pu, addr);
 106 #endif
 107                 }
 108         }
 109
 110         return NULL;
 111 }
 112
 113 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 114 {
 115         pgd_t *pg;
 116         pud_t *pu;
 117         hugepd_t *hpdp = NULL;
 118
 119         BUG_ON(get_slice_psize(mm, addr) != mmu_huge_psize);
 120
 121         addr &= HPAGE_MASK;
 122
 123         pg = pgd_offset(mm, addr);
 124         pu = pud_alloc(mm, pg, addr);
 125
 126         if (pu) {
 127 #ifdef CONFIG_PPC_64K_PAGES
 128                 pmd_t *pm;
 129                 pm = pmd_alloc(mm, pu, addr);
 130                 if (pm)
 131                         hpdp = (hugepd_t *)pm;
 132 #else
 133                 hpdp = (hugepd_t *)pu;
 134 #endif
 135         }
 136
 137         if (! hpdp)
 138                 return NULL;
 139
 140         if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
 141                 return NULL;
 142
 143         return hugepte_offset(hpdp, addr);
 144 }
 145
 146 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 147 {
 148         return 0;
 149 }
 150
 151 static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
 152 {
 153         pte_t *hugepte = hugepd_page(*hpdp);
 154
 155         hpdp->pd = 0;
 156         tlb->need_flush = 1;
 157         pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
 158                                                  PGF_CACHENUM_MASK));
 159 }
 160
 161 #ifdef CONFIG_PPC_64K_PAGES
 162 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 163                                    unsigned long addr, unsigned long end,
 164                                    unsigned long floor, unsigned long ceiling)
 165 {
 166         pmd_t *pmd;
 167         unsigned long next;
 168         unsigned long start;
 169
 170         start = addr;
 171         pmd = pmd_offset(pud, addr);
 172         do {
 173                 next = pmd_addr_end(addr, end);
 174                 if (pmd_none(*pmd))
 175                         continue;
 176                 free_hugepte_range(tlb, (hugepd_t *)pmd);
 177         } while (pmd++, addr = next, addr != end);
 178
 179         start &= PUD_MASK;
 180         if (start < floor)
 181                 return;
 182         if (ceiling) {
 183                 ceiling &= PUD_MASK;
 184                 if (!ceiling)
 185                         return;
 186         }
 187         if (end - 1 > ceiling - 1)
 188                 return;
 189
 190         pmd = pmd_offset(pud, start);
 191         pud_clear(pud);
 192         pmd_free_tlb(tlb, pmd);
 193 }
 194 #endif
 195
 196 static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 197                                    unsigned long addr, unsigned long end,
 198                                    unsigned long floor, unsigned long ceiling)
 199 {
 200         pud_t *pud;
 201         unsigned long next;
 202         unsigned long start;
 203
 204         start = addr;
 205         pud = pud_offset(pgd, addr);
 206         do {
 207                 next = pud_addr_end(addr, end);
 208 #ifdef CONFIG_PPC_64K_PAGES
 209                 if (pud_none_or_clear_bad(pud))
 210                         continue;
 211                 hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
 212 #else
 213                 if (pud_none(*pud))
 214                         continue;
 215                 free_hugepte_range(tlb, (hugepd_t *)pud);
 216 #endif
 217         } while (pud++, addr = next, addr != end);
 218
 219         start &= PGDIR_MASK;
 220         if (start < floor)
 221                 return;
 222         if (ceiling) {
 223                 ceiling &= PGDIR_MASK;
 224                 if (!ceiling)
 225                         return;
 226         }
 227         if (end - 1 > ceiling - 1)
 228                 return;
 229
 230         pud = pud_offset(pgd, start);
 231         pgd_clear(pgd);
 232         pud_free_tlb(tlb, pud);
 233 }
 234
 235 /*
 236  * This function frees user-level page tables of a process.
 237  *
 238  * Must be called with pagetable lock held.
 239  */
 240 void hugetlb_free_pgd_range(struct mmu_gather **tlb,
 241                             unsigned long addr, unsigned long end,
 242                             unsigned long floor, unsigned long ceiling)
 243 {
 244         pgd_t *pgd;
 245         unsigned long next;
 246         unsigned long start;
 247
 248         /*
 249          * Comments below take from the normal free_pgd_range().  They
 250          * apply here too.  The tests against HUGEPD_MASK below are
 251          * essential, because we *don't* test for this at the bottom
 252          * level.  Without them we'll attempt to free a hugepte table
 253          * when we unmap just part of it, even if there are other
 254          * active mappings using it.
 255          *
 256          * The next few lines have given us lots of grief...
 257          *
 258          * Why are we testing HUGEPD* at this top level?  Because
 259          * often there will be no work to do at all, and we'd prefer
 260          * not to go all the way down to the bottom just to discover
 261          * that.
 262          *
 263          * Why all these "- 1"s?  Because 0 represents both the bottom
 264          * of the address space and the top of it (using -1 for the
 265          * top wouldn't help much: the masks would do the wrong thing).
 266          * The rule is that addr 0 and floor 0 refer to the bottom of
 267          * the address space, but end 0 and ceiling 0 refer to the top
 268          * Comparisons need to use "end - 1" and "ceiling - 1" (though
 269          * that end 0 case should be mythical).
 270          *
 271          * Wherever addr is brought up or ceiling brought down, we
 272          * must be careful to reject "the opposite 0" before it
 273          * confuses the subsequent tests.  But what about where end is
 274          * brought down by HUGEPD_SIZE below? no, end can't go down to
 275          * 0 there.
 276          *
 277          * Whereas we round start (addr) and ceiling down, by different
 278          * masks at different levels, in order to test whether a table
 279          * now has no other vmas using it, so can be freed, we don't
 280          * bother to round floor or end up - the tests don't need that.
 281          */
 282
 283         addr &= HUGEPD_MASK;
 284         if (addr < floor) {
 285                 addr += HUGEPD_SIZE;
 286                 if (!addr)
 287                         return;
 288         }
 289         if (ceiling) {
 290                 ceiling &= HUGEPD_MASK;
 291                 if (!ceiling)
 292                         return;
 293         }
 294         if (end - 1 > ceiling - 1)
 295                 end -= HUGEPD_SIZE;
 296         if (addr > end - 1)
 297                 return;
 298
 299         start = addr;
 300         pgd = pgd_offset((*tlb)->mm, addr);
 301         do {
 302                 BUG_ON(get_slice_psize((*tlb)->mm, addr) != mmu_huge_psize);
 303                 next = pgd_addr_end(addr, end);
 304                 if (pgd_none_or_clear_bad(pgd))
 305                         continue;
 306                 hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
 307         } while (pgd++, addr = next, addr != end);
 308 }
 309
 310 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 311                      pte_t *ptep, pte_t pte)
 312 {
 313         if (pte_present(*ptep)) {
 314                 /* We open-code pte_clear because we need to pass the right
 315                  * argument to hpte_need_flush (huge / !huge). Might not be
 316                  * necessary anymore if we make hpte_need_flush() get the
 317                  * page size from the slices
 318                  */
 319                 pte_update(mm, addr & HPAGE_MASK, ptep, ~0UL, 1);
 320         }
 321         *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 322 }
 323
 324 pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 325                               pte_t *ptep)
 326 {
 327         unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
 328         return __pte(old);
 329 }
 330
 331 struct page *
 332 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 333 {
 334         pte_t *ptep;
 335         struct page *page;
 336
 337         if (get_slice_psize(mm, address) != mmu_huge_psize)
 338                 return ERR_PTR(-EINVAL);
 339
 340         ptep = huge_pte_offset(mm, address);
 341         page = pte_page(*ptep);
 342         if (page)
 343                 page += (address % HPAGE_SIZE) / PAGE_SIZE;
 344
 345         return page;
 346 }
 347
 348 int pmd_huge(pmd_t pmd)
 349 {
 350         return 0;
 351 }
 352
 353 struct page *
 354 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 355                 pmd_t *pmd, int write)
 356 {
 357         BUG();
 358         return NULL;
 359 }
 360
 361
 362 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 363                                         unsigned long len, unsigned long pgoff,
 364                                         unsigned long flags)
 365 {
 366         return slice_get_unmapped_area(addr, len, flags,
 367                                        mmu_huge_psize, 1, 0);
 368 }
 369
 370 /*
 371  * Called by asm hashtable.S for doing lazy icache flush
 372  */
 373 static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
 374                                                   pte_t pte, int trap)
 375 {
 376         struct page *page;
 377         int i;
 378
 379         if (!pfn_valid(pte_pfn(pte)))
 380                 return rflags;
 381
 382         page = pte_page(pte);
 383
 384         /* page is dirty */
 385         if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
 386                 if (trap == 0x400) {
 387                         for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++)
 388                                 __flush_dcache_icache(page_address(page+i));
 389                         set_bit(PG_arch_1, &page->flags);
 390                 } else {
 391                         rflags |= HPTE_R_N;
 392                 }
 393         }
 394         return rflags;
 395 }
 396
 397 int hash_huge_page(struct mm_struct *mm, unsigned long access,
 398                    unsigned long ea, unsigned long vsid, int local,
 399                    unsigned long trap)
 400 {
 401         pte_t *ptep;
 402         unsigned long old_pte, new_pte;
 403         unsigned long va, rflags, pa;
 404         long slot;
 405         int err = 1;
 406         int ssize = user_segment_size(ea);
 407
 408         ptep = huge_pte_offset(mm, ea);
 409
 410         /* Search the Linux page table for a match with va */
 411         va = hpt_va(ea, vsid, ssize);
 412
 413         /*
 414          * If no pte found or not present, send the problem up to
 415          * do_page_fault
 416          */
 417         if (unlikely(!ptep || pte_none(*ptep)))
 418                 goto out;
 419
 420         /*
 421          * Check the user's access rights to the page.  If access should be
 422          * prevented then send the problem up to do_page_fault.
 423          */
 424         if (unlikely(access & ~pte_val(*ptep)))
 425                 goto out;
 426         /*
 427          * At this point, we have a pte (old_pte) which can be used to build
 428          * or update an HPTE. There are 2 cases:
 429          *
 430          * 1. There is a valid (present) pte with no associated HPTE (this is
 431          *      the most common case)
 432          * 2. There is a valid (present) pte with an associated HPTE. The
 433          *      current values of the pp bits in the HPTE prevent access
 434          *      because we are doing software DIRTY bit management and the
 435          *      page is currently not DIRTY.
 436          */
 437
 438
 439         do {
 440                 old_pte = pte_val(*ptep);
 441                 if (old_pte & _PAGE_BUSY)
 442                         goto out;
 443                 new_pte = old_pte | _PAGE_BUSY |
 444                         _PAGE_ACCESSED | _PAGE_HASHPTE;
 445         } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
 446                                          old_pte, new_pte));
 447
 448         rflags = 0x2 | (!(new_pte & _PAGE_RW));
 449         /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
 450         rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
 451         if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
 452                 /* No CPU has hugepages but lacks no execute, so we
 453                  * don't need to worry about that case */
 454                 rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
 455                                                        trap);
 456
 457         /* Check if pte already has an hpte (case 2) */
 458         if (unlikely(old_pte & _PAGE_HASHPTE)) {
 459                 /* There MIGHT be an HPTE for this pte */
 460                 unsigned long hash, slot;
 461
 462                 hash = hpt_hash(va, HPAGE_SHIFT, ssize);
 463                 if (old_pte & _PAGE_F_SECOND)
 464                         hash = ~hash;
 465                 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
 466                 slot += (old_pte & _PAGE_F_GIX) >> 12;
 467
 468                 if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_huge_psize,
 469                                          ssize, local) == -1)
 470                         old_pte &= ~_PAGE_HPTEFLAGS;
 471         }
 472
 473         if (likely(!(old_pte & _PAGE_HASHPTE))) {
 474                 unsigned long hash = hpt_hash(va, HPAGE_SHIFT, ssize);
 475                 unsigned long hpte_group;
 476
 477                 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 478
 479 repeat:
 480                 hpte_group = ((hash & htab_hash_mask) *
 481                               HPTES_PER_GROUP) & ~0x7UL;
 482
 483                 /* clear HPTE slot informations in new PTE */
 484                 new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
 485
 486                 /* Add in WIMG bits */
 487                 /* XXX We should store these in the pte */
 488                 /* --BenH: I think they are ... */
 489                 rflags |= _PAGE_COHERENT;
 490
 491                 /* Insert into the hash table, primary slot */
 492                 slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
 493                                           mmu_huge_psize, ssize);
 494
 495                 /* Primary is full, try the secondary */
 496                 if (unlikely(slot == -1)) {
 497                         hpte_group = ((~hash & htab_hash_mask) *
 498                                       HPTES_PER_GROUP) & ~0x7UL;
 499                         slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
 500                                                   HPTE_V_SECONDARY,
 501                                                   mmu_huge_psize, ssize);
 502                         if (slot == -1) {
 503                                 if (mftb() & 0x1)
 504                                         hpte_group = ((hash & htab_hash_mask) *
 505                                                       HPTES_PER_GROUP)&~0x7UL;
 506
 507                                 ppc_md.hpte_remove(hpte_group);
 508                                 goto repeat;
 509                         }
 510                 }
 511
 512                 if (unlikely(slot == -2))
 513                         panic("hash_huge_page: pte_insert failed\n");
 514
 515                 new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
 516         }
 517
 518         /*
 519          * No need to use ldarx/stdcx here
 520          */
 521         *ptep = __pte(new_pte & ~_PAGE_BUSY);
 522
 523         err = 0;
 524
 525  out:
 526         return err;
 527 }
 528
 529 static void zero_ctor(struct kmem_cache *cache, void *addr)
 530 {
 531         memset(addr, 0, kmem_cache_size(cache));
 532 }
 533
 534 static int __init hugetlbpage_init(void)
 535 {
 536         if (!cpu_has_feature(CPU_FTR_16M_PAGE))
 537                 return -ENODEV;
 538
 539         huge_pgtable_cache = kmem_cache_create("hugepte_cache",
 540                                                HUGEPTE_TABLE_SIZE,
 541                                                HUGEPTE_TABLE_SIZE,
 542                                                0,
 543                                                zero_ctor);
 544         if (! huge_pgtable_cache)
 545                 panic("hugetlbpage_init(): could not create hugepte cache\n");
 546
 547         return 0;
 548 }
 549
 550 module_init(hugetlbpage_init);