/*
 * IA-32 Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>

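/*
 * Given a VMA (vma) containing addr and a second VMA (svma) that maps the
 * same file, return the address in svma that corresponds to addr if the
 * two mappings could share a pmd page table page: both addresses must sit
 * at the same slot within a pud-sized region, the vm_flags (ignoring
 * VM_LOCKED) must match, and svma must span the whole pud-aligned range.
 * Returns 0 if sharing is not possible.
 */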
static unsigned long page_table_shareable(struct vm_area_struct *svma,
                                unsigned long addr, pgoff_t idx,
                                struct vm_area_struct *vma)
{
        unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
                                svma->vm_start;
        unsigned long sbase = saddr & PUD_MASK;
        unsigned long s_end = sbase + PUD_SIZE;

        /* Allow segments to share if only one is marked locked */
        unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
        unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;

        /*
         * match the virtual addresses, permission and the alignment of the
         * page table page.
         */
        if (pmd_index(addr) != pmd_index(saddr) ||
            vm_flags != svm_flags ||
            sbase < svma->vm_start || svma->vm_end < s_end)
                return 0;

        return saddr;
}

static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
        unsigned long base = addr & PUD_MASK;
        unsigned long end = base + PUD_SIZE;

        /*
         * check on proper vm_flags and page table alignment
         */
        if (vma->vm_flags & VM_MAYSHARE &&
            vma->vm_start <= base && end <= vma->vm_end)
                return 1;
        return 0;
}

/*
 * Search for a shareable pmd page for hugetlb.  If another mapping of the
 * same file already has a pmd page covering this pud-sized range, take a
 * reference on that page and install it in our pud, so both mappings
 * share one page table page.
 */
static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
        struct vm_area_struct *vma = find_vma(mm, addr);
        struct address_space *mapping = vma->vm_file->f_mapping;
        pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
                        vma->vm_pgoff;
        struct prio_tree_iter iter;
        struct vm_area_struct *svma;
        unsigned long saddr;
        pte_t *spte = NULL;

        if (!vma_shareable(vma, addr))
                return;

        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
                if (svma == vma)
                        continue;

                saddr = page_table_shareable(svma, addr, idx, vma);
                if (saddr) {
                        spte = huge_pte_offset(svma->vm_mm, saddr);
                        if (spte) {
                                get_page(virt_to_page(spte));
                                break;
                        }
                }
        }

        if (!spte)
                goto out;

        spin_lock(&mm->page_table_lock);
        if (pud_none(*pud))
                pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
        else
                put_page(virt_to_page(spte));
        spin_unlock(&mm->page_table_lock);
out:
        spin_unlock(&mapping->i_mmap_lock);
}

/*
 * unmap huge page backed by shared pte.
 *
 * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
 * indicated by page_count > 1, unmap is achieved by clearing pud and
 * decrementing the ref count.  If count == 1, the pte page is not shared.
 *
 * called with vma->vm_mm->page_table_lock held.
 *
 * returns: 1 successfully unmapped a shared pte page
 *          0 the underlying pte page is not shared, or it is the last user
 */
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
        pgd_t *pgd = pgd_offset(mm, *addr);
        pud_t *pud = pud_offset(pgd, *addr);

        BUG_ON(page_count(virt_to_page(ptep)) == 0);
        if (page_count(virt_to_page(ptep)) == 1)
                return 0;

        pud_clear(pud);
        put_page(virt_to_page(ptep));
        *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
        return 1;
}

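/*
 * Allocate (or find) the page-table slot that will hold a huge pte for
 * addr.  For a pud-sized page (1 GB on x86-64) the pud entry itself is
 * used; for a pmd-sized page we first try to share an existing pmd page
 * with another mapping of the same file, then fall back to allocating one.
 */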
pte_t *huge_pte_alloc(struct mm_struct *mm,
                        unsigned long addr, unsigned long sz)
{
        pgd_t *pgd;
        pud_t *pud;
        pte_t *pte = NULL;

        pgd = pgd_offset(mm, addr);
        pud = pud_alloc(mm, pgd, addr);
        if (pud) {
                if (sz == PUD_SIZE) {
                        pte = (pte_t *)pud;
                } else {
                        BUG_ON(sz != PMD_SIZE);
                        if (pud_none(*pud))
                                huge_pmd_share(mm, addr, pud);
                        pte = (pte_t *) pmd_alloc(mm, pud, addr);
                }
        }
        BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));

        return pte;
}

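/*
 * Walk the page tables and return the (huge) pte for addr, or NULL if
 * nothing is mapped there.  A large pud entry is returned directly;
 * otherwise the pmd entry covering addr is returned.
 */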
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd = NULL;

        pgd = pgd_offset(mm, addr);
        if (pgd_present(*pgd)) {
                pud = pud_offset(pgd, addr);
                if (pud_present(*pud)) {
                        if (pud_large(*pud))
                                return (pte_t *)pud;
                        pmd = pmd_offset(pud, addr);
                }
        }
        return (pte_t *) pmd;
}

#if 0   /* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        unsigned long start = address;
        int length = 1;
        int nr;
        struct page *page;
        struct vm_area_struct *vma;

        vma = find_vma(mm, addr);
        if (!vma || !is_vm_hugetlb_page(vma))
                return ERR_PTR(-EINVAL);

        pte = huge_pte_offset(mm, address);

        /* hugetlb should be locked, and hence, prefaulted */
        WARN_ON(!pte || pte_none(*pte));

        page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];

        WARN_ON(!PageHead(page));

        return page;
}

int pmd_huge(pmd_t pmd)
{
        return 0;
}

int pud_huge(pud_t pud)
{
        return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        return NULL;
}

#else

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        return ERR_PTR(-EINVAL);
}

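/*
 * A pmd/pud entry maps a huge page, rather than pointing at a lower-level
 * page table, when the PSE bit (_PAGE_PSE) is set in the entry.
 */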
int pmd_huge(pmd_t pmd)
{
        return !!(pmd_val(pmd) & _PAGE_PSE);
}

int pud_huge(pud_t pud)
{
        return !!(pud_val(pud) & _PAGE_PSE);
}

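/*
 * Given the pmd/pud entry mapping a huge page, return the struct page of
 * the base (4 KB) subpage that contains address.
 */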
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        struct page *page;

        page = pte_page(*(pte_t *)pmd);
        if (page)
                page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
        return page;
}

struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
                pud_t *pud, int write)
{
        struct page *page;

        page = pte_page(*(pte_t *)pud);
        if (page)
                page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
        return page;
}

#endif

/* x86_64 also uses this file */

#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
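/*
 * Bottom-up search for a free, huge-page-aligned region of length len,
 * starting from the cached free-area hint (or TASK_UNMAPPED_BASE) and
 * walking the VMA list upward until a large enough hole is found.
 */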
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
                unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags)
{
        struct hstate *h = hstate_file(file);
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long start_addr;

        if (len > mm->cached_hole_size) {
                start_addr = mm->free_area_cache;
        } else {
                start_addr = TASK_UNMAPPED_BASE;
                mm->cached_hole_size = 0;
        }

full_search:
        addr = ALIGN(start_addr, huge_page_size(h));

        for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
                /* At this point:  (!vma || addr < vma->vm_end). */
                if (TASK_SIZE - len < addr) {
                        /*
                         * Start a new search - just in case we missed
                         * some holes.
                         */
                        if (start_addr != TASK_UNMAPPED_BASE) {
                                start_addr = TASK_UNMAPPED_BASE;
                                mm->cached_hole_size = 0;
                                goto full_search;
                        }
                        return -ENOMEM;
                }
                if (!vma || addr + len <= vma->vm_start) {
                        mm->free_area_cache = addr + len;
                        return addr;
                }
                if (addr + mm->cached_hole_size < vma->vm_start)
                        mm->cached_hole_size = vma->vm_start - addr;
                addr = ALIGN(vma->vm_end, huge_page_size(h));
        }
}

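/*
 * Top-down variant: search below mm->mmap_base for a huge-page-aligned
 * hole of length len, falling back to the bottom-up search if nothing
 * fits below the base.
 */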
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
                unsigned long addr0, unsigned long len,
                unsigned long pgoff, unsigned long flags)
{
        struct hstate *h = hstate_file(file);
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev_vma;
        unsigned long base = mm->mmap_base, addr = addr0;
        unsigned long largest_hole = mm->cached_hole_size;
        int first_time = 1;

        /* don't allow allocations above current base */
        if (mm->free_area_cache > base)
                mm->free_area_cache = base;

        if (len <= largest_hole) {
                largest_hole = 0;
                mm->free_area_cache = base;
        }
try_again:
        /* make sure it can fit in the remaining address space */
        if (mm->free_area_cache < len)
                goto fail;

        /* either no address requested or can't fit in requested address hole */
        addr = (mm->free_area_cache - len) & huge_page_mask(h);
        do {
                /*
                 * Lookup failure means no vma is above this address,
                 * i.e. return with success:
                 */
                if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
                        return addr;

                /*
                 * new region fits between prev_vma->vm_end and
                 * vma->vm_start, use it:
                 */
                if (addr + len <= vma->vm_start &&
                    (!prev_vma || (addr >= prev_vma->vm_end))) {
                        /* remember the address as a hint for next time */
                        mm->cached_hole_size = largest_hole;
                        return (mm->free_area_cache = addr);
                } else {
                        /* pull free_area_cache down to the first hole */
                        if (mm->free_area_cache == vma->vm_end) {
                                mm->free_area_cache = vma->vm_start;
                                mm->cached_hole_size = largest_hole;
                        }
                }

                /* remember the largest hole we saw so far */
                if (addr + largest_hole < vma->vm_start)
                        largest_hole = vma->vm_start - addr;

                /* try just below the current vma->vm_start */
                addr = (vma->vm_start - len) & huge_page_mask(h);
        } while (len <= vma->vm_start);

fail:
        /*
         * if hint left us with no space for the requested
         * mapping then try again:
         */
        if (first_time) {
                mm->free_area_cache = base;
                largest_hole = 0;
                first_time = 0;
                goto try_again;
        }
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
        mm->free_area_cache = TASK_UNMAPPED_BASE;
        mm->cached_hole_size = ~0UL;
        addr = hugetlb_get_unmapped_area_bottomup(file, addr0,
                        len, pgoff, flags);

        /*
         * Restore the topdown base:
         */
        mm->free_area_cache = base;
        mm->cached_hole_size = ~0UL;

        return addr;
}

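/*
 * Entry point for placing a hugetlb mapping: validate the length, honour
 * MAP_FIXED and any address hint if possible, then delegate to the
 * bottom-up or top-down search to match the process's mmap layout.
 */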
unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        struct hstate *h = hstate_file(file);
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;

        if (len & ~huge_page_mask(h))
                return -EINVAL;
        if (len > TASK_SIZE)
                return -ENOMEM;

        if (flags & MAP_FIXED) {
                if (prepare_hugepage_range(file, addr, len))
                        return -EINVAL;
                return addr;
        }

        if (addr) {
                addr = ALIGN(addr, huge_page_size(h));
                vma = find_vma(mm, addr);
                if (TASK_SIZE - len >= addr &&
                    (!vma || addr + len <= vma->vm_start))
                        return addr;
        }
        if (mm->get_unmapped_area == arch_get_unmapped_area)
                return hugetlb_get_unmapped_area_bottomup(file, addr, len,
                                pgoff, flags);
        else
                return hugetlb_get_unmapped_area_topdown(file, addr, len,
                                pgoff, flags);
}

#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/

#ifdef CONFIG_X86_64
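/*
 * Parse the "hugepagesz=" kernel command-line option: register a 2 MB
 * (pmd-sized) hstate, or a 1 GB (pud-sized) hstate when the CPU supports
 * gbpages; reject any other size.
 */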
static __init int setup_hugepagesz(char *opt)
{
        unsigned long ps = memparse(opt, &opt);
        if (ps == PMD_SIZE) {
                hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
        } else if (ps == PUD_SIZE && cpu_has_gbpages) {
                hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
        } else {
                printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
                        ps >> 20);
                return 0;
        }
        return 1;
}
__setup("hugepagesz=", setup_hugepagesz);
#endif