/*
 * IA-32 Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>

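/*
 * Decide whether svma can share a pmd page table page with vma for the
 * huge mapping at addr: the matching address in svma must land on the
 * same pmd index, the vm_flags must agree (VM_LOCKED aside), and the
 * whole PUD_SIZE region must lie within svma. Returns the matching
 * address in svma, or 0 if the page table page is not shareable.
 */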
static unsigned long page_table_shareable(struct vm_area_struct *svma,
                                struct vm_area_struct *vma,
                                unsigned long addr, pgoff_t idx)
{
        unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
                                svma->vm_start;
        unsigned long sbase = saddr & PUD_MASK;
        unsigned long s_end = sbase + PUD_SIZE;

        /* Allow segments to share if only one is marked locked */
        unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
        unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;

        /*
         * Match the virtual addresses, permissions and the alignment of
         * the page table page.
         */
        if (pmd_index(addr) != pmd_index(saddr) ||
            vm_flags != svm_flags ||
            sbase < svma->vm_start || svma->vm_end < s_end)
                return 0;

        return saddr;
}

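/*
 * Report whether vma could share a pmd page table page at addr: the
 * mapping must be VM_MAYSHARE and must span the entire PUD_SIZE region
 * that the pmd page covers.
 */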
static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
        unsigned long base = addr & PUD_MASK;
        unsigned long end = base + PUD_SIZE;

        /*
         * check on proper vm_flags and page table alignment
         */
        if (vma->vm_flags & VM_MAYSHARE &&
            vma->vm_start <= base && end <= vma->vm_end)
                return 1;
        return 0;
}

/*
 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
 * and returns the corresponding pte. While this is not necessary for the
 * !shared pmd case because we can allocate the pmd later as well, it makes the
 * code much cleaner. pmd allocation is essential for the shared case because
 * pud has to be populated inside the same i_mmap_mutex section - otherwise
 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
 * bad pmd for sharing.
 */
static pte_t *
huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
        struct vm_area_struct *vma = find_vma(mm, addr);
        struct address_space *mapping = vma->vm_file->f_mapping;
        pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
                        vma->vm_pgoff;
        struct vm_area_struct *svma;
        unsigned long saddr;
        pte_t *spte = NULL;
        pte_t *pte;

        if (!vma_shareable(vma, addr))
                return (pte_t *)pmd_alloc(mm, pud, addr);

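        /*
         * Walk every other mapping of the same file that overlaps this
         * page offset, looking for an already-instantiated pmd page we
         * can take a reference on and reuse.
         */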
        mutex_lock(&mapping->i_mmap_mutex);
        vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
                if (svma == vma)
                        continue;

                saddr = page_table_shareable(svma, vma, addr, idx);
                if (saddr) {
                        spte = huge_pte_offset(svma->vm_mm, saddr);
                        if (spte) {
                                get_page(virt_to_page(spte));
                                break;
                        }
                }
        }

        if (!spte)
                goto out;

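        /*
         * Only populate the pud if it is still empty once we hold the
         * page_table_lock; if someone else won the race, drop the extra
         * reference taken on the shared pmd page above.
         */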
        spin_lock(&mm->page_table_lock);
        if (pud_none(*pud))
                pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
        else
                put_page(virt_to_page(spte));
        spin_unlock(&mm->page_table_lock);
out:
        pte = (pte_t *)pmd_alloc(mm, pud, addr);
        mutex_unlock(&mapping->i_mmap_mutex);
        return pte;
}

/*
 * Unmap a huge page backed by a shared pte.
 *
 * The hugetlb pte page is refcounted at the time of mapping. If the pte is
 * shared (indicated by page_count > 1), unmap is achieved by clearing the
 * pud and decrementing the refcount. If count == 1, the pte page is not
 * shared.
 *
 * Called with vma->vm_mm->page_table_lock held.
 *
 * Returns: 1 successfully unmapped a shared pte page
 *          0 the underlying pte page is not shared, or it is the last user
 */
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
        pgd_t *pgd = pgd_offset(mm, *addr);
        pud_t *pud = pud_offset(pgd, *addr);

        BUG_ON(page_count(virt_to_page(ptep)) == 0);
        if (page_count(virt_to_page(ptep)) == 1)
                return 0;

        pud_clear(pud);
        put_page(virt_to_page(ptep));
        *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
        return 1;
}

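/*
 * Allocate (or look up) the page table entry that will map a huge page
 * of size sz at addr: the pud entry itself when sz == PUD_SIZE,
 * otherwise a pmd, shared with other mappings of the same file when
 * page_table_shareable() allows it.
 */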
pte_t *huge_pte_alloc(struct mm_struct *mm,
                        unsigned long addr, unsigned long sz)
{
        pgd_t *pgd;
        pud_t *pud;
        pte_t *pte = NULL;

        pgd = pgd_offset(mm, addr);
        pud = pud_alloc(mm, pgd, addr);
        if (pud) {
                if (sz == PUD_SIZE) {
                        pte = (pte_t *)pud;
                } else {
                        BUG_ON(sz != PMD_SIZE);
                        if (pud_none(*pud))
                                pte = huge_pmd_share(mm, addr, pud);
                        else
                                pte = (pte_t *)pmd_alloc(mm, pud, addr);
                }
        }
        BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));

        return pte;
}

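/*
 * Walk the page tables for addr without allocating anything: returns
 * the pud entry itself if it maps a large page, otherwise the pmd
 * entry for addr, or NULL if the upper levels are not present.
 */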
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd = NULL;

        pgd = pgd_offset(mm, addr);
        if (pgd_present(*pgd)) {
                pud = pud_offset(pgd, addr);
                if (pud_present(*pud)) {
                        if (pud_large(*pud))
                                return (pte_t *)pud;
                        pmd = pmd_offset(pud, addr);
                }
        }
        return (pte_t *)pmd;
}

#if 0	/* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        unsigned long start = address;
        int length = 1;
        int nr;
        struct page *page;
        struct vm_area_struct *vma;
        /*
         * pte and vpfn were undeclared in this dead test block; declared
         * here so it would compile. vpfn as the address's page frame
         * number is an assumption.
         */
        pte_t *pte;
        unsigned long vpfn = address / PAGE_SIZE;

        vma = find_vma(mm, address);
        if (!vma || !is_vm_hugetlb_page(vma))
                return ERR_PTR(-EINVAL);

        pte = huge_pte_offset(mm, address);

        /* hugetlb should be locked, and hence, prefaulted */
        WARN_ON(!pte || pte_none(*pte));

        page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];

        WARN_ON(!PageHead(page));

        return page;
}

int pmd_huge(pmd_t pmd)
{
        return 0;
}

int pud_huge(pud_t pud)
{
        return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        return NULL;
}

#else

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        return ERR_PTR(-EINVAL);
}

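/*
 * A pmd/pud entry maps a huge page iff the PSE (page size extension)
 * bit is set in the entry.
 */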
int pmd_huge(pmd_t pmd)
{
        return !!(pmd_val(pmd) & _PAGE_PSE);
}

int pud_huge(pud_t pud)
{
        return !!(pud_val(pud) & _PAGE_PSE);
}

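/*
 * Given a huge pmd/pud entry, return the struct page for the 4K page
 * within the compound huge page that backs address.
 */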
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        struct page *page;

        page = pte_page(*(pte_t *)pmd);
        if (page)
                page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
        return page;
}

struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
                pud_t *pud, int write)
{
        struct page *page;

        page = pte_page(*(pte_t *)pud);
        if (page)
                page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
        return page;
}

#endif

/* x86_64 also uses this file */

#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
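/*
 * Bottom-up search: starting from the cached free-area hint (or
 * TASK_UNMAPPED_BASE), walk the vma list upward for the first
 * huge-page-aligned hole of at least len bytes.
 */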
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
                unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags)
{
        struct hstate *h = hstate_file(file);
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long start_addr;

        if (len > mm->cached_hole_size) {
                start_addr = mm->free_area_cache;
        } else {
                start_addr = TASK_UNMAPPED_BASE;
                mm->cached_hole_size = 0;
        }

full_search:
        addr = ALIGN(start_addr, huge_page_size(h));

        for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
                /* At this point: (!vma || addr < vma->vm_end). */
                if (TASK_SIZE - len < addr) {
                        /*
                         * Start a new search - just in case we missed
                         * some holes.
                         */
                        if (start_addr != TASK_UNMAPPED_BASE) {
                                start_addr = TASK_UNMAPPED_BASE;
                                mm->cached_hole_size = 0;
                                goto full_search;
                        }
                        return -ENOMEM;
                }
                if (!vma || addr + len <= vma->vm_start) {
                        mm->free_area_cache = addr + len;
                        return addr;
                }
                if (addr + mm->cached_hole_size < vma->vm_start)
                        mm->cached_hole_size = vma->vm_start - addr;
                addr = ALIGN(vma->vm_end, huge_page_size(h));
        }
}

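/*
 * Top-down search: work downward from mm->mmap_base, trying the gap
 * just below each vma until a huge-page-aligned hole of len bytes is
 * found; falls back to the bottom-up search if that fails.
 */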
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
                unsigned long addr0, unsigned long len,
                unsigned long pgoff, unsigned long flags)
{
        struct hstate *h = hstate_file(file);
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long base = mm->mmap_base;
        unsigned long addr = addr0;
        unsigned long largest_hole = mm->cached_hole_size;
        unsigned long start_addr;

        /* don't allow allocations above current base */
        if (mm->free_area_cache > base)
                mm->free_area_cache = base;

        if (len <= largest_hole) {
                largest_hole = 0;
                mm->free_area_cache = base;
        }
try_again:
        start_addr = mm->free_area_cache;

        /* make sure it can fit in the remaining address space */
        if (mm->free_area_cache < len)
                goto fail;

        /* either no address requested or can't fit in requested address hole */
        addr = (mm->free_area_cache - len) & huge_page_mask(h);
        do {
                /*
                 * Lookup failure means no vma is above this address,
                 * i.e. return with success:
                 */
                vma = find_vma(mm, addr);
                if (!vma)
                        return addr;

                if (addr + len <= vma->vm_start) {
                        /* remember the address as a hint for next time */
                        mm->cached_hole_size = largest_hole;
                        return (mm->free_area_cache = addr);
                } else if (mm->free_area_cache == vma->vm_end) {
                        /* pull free_area_cache down to the first hole */
                        mm->free_area_cache = vma->vm_start;
                        mm->cached_hole_size = largest_hole;
                }

                /* remember the largest hole we saw so far */
                if (addr + largest_hole < vma->vm_start)
                        largest_hole = vma->vm_start - addr;

                /* try just below the current vma->vm_start */
                addr = (vma->vm_start - len) & huge_page_mask(h);
        } while (len <= vma->vm_start);

fail:
        /*
         * if hint left us with no space for the requested
         * mapping then try again:
         */
        if (start_addr != base) {
                mm->free_area_cache = base;
                largest_hole = 0;
                goto try_again;
        }
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
        mm->free_area_cache = TASK_UNMAPPED_BASE;
        mm->cached_hole_size = ~0UL;
        addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
                        len, pgoff, flags);

        /*
         * Restore the topdown base:
         */
        mm->free_area_cache = base;
        mm->cached_hole_size = ~0UL;

        return addr;
}

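/*
 * Main entry point for hugetlb mmap placement: validates the length and
 * any hint address, honours MAP_FIXED, and otherwise dispatches to the
 * bottom-up or top-down search to match the mm's normal mmap layout.
 */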
unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        struct hstate *h = hstate_file(file);
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;

        if (len & ~huge_page_mask(h))
                return -EINVAL;
        if (len > TASK_SIZE)
                return -ENOMEM;

        if (flags & MAP_FIXED) {
                if (prepare_hugepage_range(file, addr, len))
                        return -EINVAL;
                return addr;
        }

        if (addr) {
                addr = ALIGN(addr, huge_page_size(h));
                vma = find_vma(mm, addr);
                if (TASK_SIZE - len >= addr &&
                    (!vma || addr + len <= vma->vm_start))
                        return addr;
        }
        if (mm->get_unmapped_area == arch_get_unmapped_area)
                return hugetlb_get_unmapped_area_bottomup(file, addr, len,
                                pgoff, flags);
        else
                return hugetlb_get_unmapped_area_topdown(file, addr, len,
                                pgoff, flags);
}

#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */

#ifdef CONFIG_X86_64
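/*
 * Parse the "hugepagesz=" kernel command-line option and register the
 * corresponding huge page size, e.g. "hugepagesz=2M" or, on CPUs with
 * gbpages support, "hugepagesz=1G" (typically combined with a
 * "hugepages=N" pool size on the same command line).
 */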
static __init int setup_hugepagesz(char *opt)
{
        unsigned long ps = memparse(opt, &opt);
        if (ps == PMD_SIZE) {
                hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
        } else if (ps == PUD_SIZE && cpu_has_gbpages) {
                hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
        } else {
                printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
                        ps >> 20);
                return 0;
        }
        return 1;
}
__setup("hugepagesz=", setup_hugepagesz);
#endif /* CONFIG_X86_64 */