/*
 * Copyright 2007 Andi Kleen, SUSE Labs.
 * Subject to the GPL, v.2
 *
 * This contains most of the x86 vDSO kernel-side code.
 */
7 | #include <linux/mm.h> | |
4e950f6f | 8 | #include <linux/err.h> |
2aae950b | 9 | #include <linux/sched.h> |
5a0e3ad6 | 10 | #include <linux/slab.h> |
2aae950b AK |
11 | #include <linux/init.h> |
12 | #include <linux/random.h> | |
3fa89ca7 | 13 | #include <linux/elf.h> |
d4f829dd | 14 | #include <linux/cpu.h> |
b059a453 | 15 | #include <linux/ptrace.h> |
cc1e24fd | 16 | #include <asm/pvclock.h> |
2aae950b AK |
17 | #include <asm/vgtod.h> |
18 | #include <asm/proto.h> | |
7f3646aa | 19 | #include <asm/vdso.h> |
1c0c1b93 | 20 | #include <asm/vvar.h> |
aafade24 | 21 | #include <asm/page.h> |
d4f829dd | 22 | #include <asm/desc.h> |
cd4d09ec | 23 | #include <asm/cpufeature.h> |
2aae950b | 24 | |
#if defined(CONFIG_X86_64)
/*
 * Nonzero means the 64-bit vDSO gets mapped into new processes;
 * set from the "vdso=" boot option (see vdso_setup() below).
 */
unsigned int __read_mostly vdso64_enabled = 1;
#endif
1a21d4e0 | 28 | |
6f121e54 | 29 | void __init init_vdso_image(const struct vdso_image *image) |
1a21d4e0 | 30 | { |
6f121e54 | 31 | BUG_ON(image->size % PAGE_SIZE != 0); |
1a21d4e0 | 32 | |
6f121e54 AL |
33 | apply_alternatives((struct alt_instr *)(image->data + image->alt), |
34 | (struct alt_instr *)(image->data + image->alt + | |
35 | image->alt_len)); | |
1a21d4e0 | 36 | } |
1b3f2a72 | 37 | |
2aae950b AK |
38 | struct linux_binprm; |
39 | ||
05ef76b2 AL |
40 | static int vdso_fault(const struct vm_special_mapping *sm, |
41 | struct vm_area_struct *vma, struct vm_fault *vmf) | |
42 | { | |
43 | const struct vdso_image *image = vma->vm_mm->context.vdso_image; | |
44 | ||
45 | if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size) | |
46 | return VM_FAULT_SIGBUS; | |
47 | ||
48 | vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT)); | |
49 | get_page(vmf->page); | |
50 | return 0; | |
51 | } | |
52 | ||
/*
 * After an mremap() of the 32-bit vDSO, retarget a user IP that was
 * parked on the old int80 landing pad to the pad's new location.
 */
static void vdso_fix_landing(const struct vdso_image *image,
			     struct vm_area_struct *new_vma)
{
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
	struct pt_regs *regs;
	unsigned long old_land_addr;

	if (!in_ia32_syscall() || image != &vdso_image_32)
		return;

	regs = current_pt_regs();
	old_land_addr = image->sym_int80_landing_pad +
			(unsigned long)current->mm->context.vdso;

	/* Fixing userspace landing - look at do_fast_syscall_32 */
	if (regs->ip == old_land_addr)
		regs->ip = new_vma->vm_start + image->sym_int80_landing_pad;
#endif
}
69 | ||
70 | static int vdso_mremap(const struct vm_special_mapping *sm, | |
71 | struct vm_area_struct *new_vma) | |
72 | { | |
73 | unsigned long new_size = new_vma->vm_end - new_vma->vm_start; | |
74 | const struct vdso_image *image = current->mm->context.vdso_image; | |
75 | ||
76 | if (image->size != new_size) | |
77 | return -EINVAL; | |
78 | ||
79 | if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) | |
80 | return -EFAULT; | |
81 | ||
82 | vdso_fix_landing(image, new_vma); | |
83 | current->mm->context.vdso = (void __user *)new_vma->vm_start; | |
84 | ||
85 | return 0; | |
86 | } | |
05ef76b2 | 87 | |
a48a7042 AL |
88 | static int vvar_fault(const struct vm_special_mapping *sm, |
89 | struct vm_area_struct *vma, struct vm_fault *vmf) | |
90 | { | |
91 | const struct vdso_image *image = vma->vm_mm->context.vdso_image; | |
92 | long sym_offset; | |
93 | int ret = -EFAULT; | |
94 | ||
95 | if (!image) | |
96 | return VM_FAULT_SIGBUS; | |
97 | ||
98 | sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) + | |
99 | image->sym_vvar_start; | |
100 | ||
101 | /* | |
102 | * Sanity check: a symbol offset of zero means that the page | |
103 | * does not exist for this vdso image, not that the page is at | |
104 | * offset zero relative to the text mapping. This should be | |
105 | * impossible here, because sym_offset should only be zero for | |
106 | * the page past the end of the vvar mapping. | |
107 | */ | |
108 | if (sym_offset == 0) | |
109 | return VM_FAULT_SIGBUS; | |
110 | ||
111 | if (sym_offset == image->sym_vvar_page) { | |
1a29d85e | 112 | ret = vm_insert_pfn(vma, vmf->address, |
a48a7042 | 113 | __pa_symbol(&__vvar_page) >> PAGE_SHIFT); |
a48a7042 AL |
114 | } else if (sym_offset == image->sym_pvclock_page) { |
115 | struct pvclock_vsyscall_time_info *pvti = | |
116 | pvclock_pvti_cpu0_va(); | |
bd902c53 | 117 | if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { |
a48a7042 AL |
118 | ret = vm_insert_pfn( |
119 | vma, | |
1a29d85e | 120 | vmf->address, |
a48a7042 AL |
121 | __pa(pvti) >> PAGE_SHIFT); |
122 | } | |
123 | } | |
124 | ||
125 | if (ret == 0 || ret == -EBUSY) | |
126 | return VM_FAULT_NOPAGE; | |
127 | ||
128 | return VM_FAULT_SIGBUS; | |
129 | } | |
130 | ||
2eefd878 DS |
131 | static const struct vm_special_mapping vdso_mapping = { |
132 | .name = "[vdso]", | |
133 | .fault = vdso_fault, | |
134 | .mremap = vdso_mremap, | |
135 | }; | |
136 | static const struct vm_special_mapping vvar_mapping = { | |
137 | .name = "[vvar]", | |
138 | .fault = vvar_fault, | |
139 | }; | |
140 | ||
576ebfef DS |
141 | /* |
142 | * Add vdso and vvar mappings to current process. | |
143 | * @image - blob to map | |
144 | * @addr - request a specific address (zero to map at free addr) | |
145 | */ | |
146 | static int map_vdso(const struct vdso_image *image, unsigned long addr) | |
2aae950b AK |
147 | { |
148 | struct mm_struct *mm = current->mm; | |
18d0a6fd | 149 | struct vm_area_struct *vma; |
576ebfef | 150 | unsigned long text_start; |
18d0a6fd | 151 | int ret = 0; |
b059a453 | 152 | |
69048176 MH |
153 | if (down_write_killable(&mm->mmap_sem)) |
154 | return -EINTR; | |
18d0a6fd | 155 | |
e6577a7c AL |
156 | addr = get_unmapped_area(NULL, addr, |
157 | image->size - image->sym_vvar_start, 0, 0); | |
2aae950b AK |
158 | if (IS_ERR_VALUE(addr)) { |
159 | ret = addr; | |
160 | goto up_fail; | |
161 | } | |
162 | ||
e6577a7c | 163 | text_start = addr - image->sym_vvar_start; |
f7b6eb3f | 164 | |
18d0a6fd AL |
165 | /* |
166 | * MAYWRITE to allow gdb to COW and set breakpoints | |
167 | */ | |
a62c34bd | 168 | vma = _install_special_mapping(mm, |
e6577a7c | 169 | text_start, |
a62c34bd AL |
170 | image->size, |
171 | VM_READ|VM_EXEC| | |
172 | VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, | |
b059a453 | 173 | &vdso_mapping); |
18d0a6fd | 174 | |
a62c34bd AL |
175 | if (IS_ERR(vma)) { |
176 | ret = PTR_ERR(vma); | |
18d0a6fd | 177 | goto up_fail; |
a62c34bd | 178 | } |
18d0a6fd AL |
179 | |
180 | vma = _install_special_mapping(mm, | |
e6577a7c AL |
181 | addr, |
182 | -image->sym_vvar_start, | |
a48a7042 AL |
183 | VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| |
184 | VM_PFNMAP, | |
a62c34bd | 185 | &vvar_mapping); |
18d0a6fd AL |
186 | |
187 | if (IS_ERR(vma)) { | |
188 | ret = PTR_ERR(vma); | |
e38447ee | 189 | do_munmap(mm, text_start, image->size); |
67dece7d DS |
190 | } else { |
191 | current->mm->context.vdso = (void __user *)text_start; | |
192 | current->mm->context.vdso_image = image; | |
f7b6eb3f | 193 | } |
2aae950b | 194 | |
2aae950b AK |
195 | up_fail: |
196 | up_write(&mm->mmap_sem); | |
197 | return ret; | |
198 | } | |
199 | ||
3947f493 IM |
200 | #ifdef CONFIG_X86_64 |
201 | /* | |
202 | * Put the vdso above the (randomized) stack with another randomized | |
203 | * offset. This way there is no hole in the middle of address space. | |
204 | * To save memory make sure it is still in the same PTE as the stack | |
205 | * top. This doesn't give that many random bits. | |
206 | * | |
207 | * Note that this algorithm is imperfect: the distribution of the vdso | |
208 | * start address within a PMD is biased toward the end. | |
209 | * | |
210 | * Only used for the 64-bit and x32 vdsos. | |
211 | */ | |
212 | static unsigned long vdso_addr(unsigned long start, unsigned len) | |
213 | { | |
214 | unsigned long addr, end; | |
215 | unsigned offset; | |
216 | ||
217 | /* | |
218 | * Round up the start address. It can start out unaligned as a result | |
219 | * of stack start randomization. | |
220 | */ | |
221 | start = PAGE_ALIGN(start); | |
222 | ||
223 | /* Round the lowest possible end address up to a PMD boundary. */ | |
224 | end = (start + len + PMD_SIZE - 1) & PMD_MASK; | |
225 | if (end >= TASK_SIZE_MAX) | |
226 | end = TASK_SIZE_MAX; | |
227 | end -= len; | |
228 | ||
229 | if (end > start) { | |
230 | offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); | |
231 | addr = start + (offset << PAGE_SHIFT); | |
232 | } else { | |
233 | addr = start; | |
234 | } | |
235 | ||
236 | /* | |
237 | * Forcibly align the final address in case we have a hardware | |
238 | * issue that requires alignment for performance reasons. | |
239 | */ | |
240 | addr = align_vdso_addr(addr); | |
241 | ||
242 | return addr; | |
243 | } | |
244 | ||
576ebfef DS |
245 | static int map_vdso_randomized(const struct vdso_image *image) |
246 | { | |
3947f493 IM |
247 | unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); |
248 | ||
576ebfef DS |
249 | return map_vdso(image, addr); |
250 | } | |
3947f493 | 251 | #endif |
576ebfef | 252 | |
2eefd878 DS |
253 | int map_vdso_once(const struct vdso_image *image, unsigned long addr) |
254 | { | |
255 | struct mm_struct *mm = current->mm; | |
256 | struct vm_area_struct *vma; | |
257 | ||
258 | down_write(&mm->mmap_sem); | |
259 | /* | |
260 | * Check if we have already mapped vdso blob - fail to prevent | |
261 | * abusing from userspace install_speciall_mapping, which may | |
262 | * not do accounting and rlimit right. | |
263 | * We could search vma near context.vdso, but it's a slowpath, | |
264 | * so let's explicitely check all VMAs to be completely sure. | |
265 | */ | |
266 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | |
267 | if (vma_is_special_mapping(vma, &vdso_mapping) || | |
268 | vma_is_special_mapping(vma, &vvar_mapping)) { | |
269 | up_write(&mm->mmap_sem); | |
270 | return -EEXIST; | |
271 | } | |
272 | } | |
273 | up_write(&mm->mmap_sem); | |
274 | ||
275 | return map_vdso(image, addr); | |
276 | } | |
277 | ||
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/* Map the 32-bit vDSO if enabled; any value other than 1 disables it. */
static int load_vdso32(void)
{
	return (vdso32_enabled == 1) ? map_vdso(&vdso_image_32, 0) : 0;
}
#endif
287 | ||
288 | #ifdef CONFIG_X86_64 | |
1a21d4e0 L |
289 | int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) |
290 | { | |
18d0a6fd AL |
291 | if (!vdso64_enabled) |
292 | return 0; | |
293 | ||
576ebfef | 294 | return map_vdso_randomized(&vdso_image_64); |
1a21d4e0 L |
295 | } |
296 | ||
#ifdef CONFIG_COMPAT
/* exec-time vDSO mapping for compat (x32 or IA32) processes. */
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
				       int uses_interp)
{
#ifdef CONFIG_X86_X32_ABI
	/* x32 tasks get the x32 image, gated by the 64-bit enable knob. */
	if (test_thread_flag(TIF_X32))
		return vdso64_enabled ? map_vdso_randomized(&vdso_image_x32) : 0;
#endif
#ifdef CONFIG_IA32_EMULATION
	return load_vdso32();
#else
	return 0;
#endif
}
#endif
315 | #else | |
/* Native 32-bit kernels: map the 32-bit vDSO at exec time. */
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	return load_vdso32();
}
320 | #endif | |
321 | ||
#ifdef CONFIG_X86_64
/* Parse the "vdso=" boot option into vdso64_enabled. */
static __init int vdso_setup(char *s)
{
	vdso64_enabled = simple_strtoul(s, NULL, 0);
	return 0;
}
__setup("vdso=", vdso_setup);
#endif
d4f829dd AL |
330 | |
331 | #ifdef CONFIG_X86_64 | |
1c0c1b93 | 332 | static void vgetcpu_cpu_init(void *arg) |
d4f829dd | 333 | { |
1c0c1b93 | 334 | int cpu = smp_processor_id(); |
a92f101b | 335 | struct desc_struct d = { }; |
d4f829dd AL |
336 | unsigned long node = 0; |
337 | #ifdef CONFIG_NUMA | |
338 | node = cpu_to_node(cpu); | |
339 | #endif | |
8c725306 | 340 | if (static_cpu_has(X86_FEATURE_RDTSCP)) |
d4f829dd AL |
341 | write_rdtscp_aux((node << 12) | cpu); |
342 | ||
343 | /* | |
25880156 AL |
344 | * Store cpu number in limit so that it can be loaded |
345 | * quickly in user space in vgetcpu. (12 bits for the CPU | |
346 | * and 8 bits for the node) | |
d4f829dd | 347 | */ |
a92f101b AM |
348 | d.limit0 = cpu | ((node & 0xf) << 12); |
349 | d.limit = node >> 4; | |
350 | d.type = 5; /* RO data, expand down, accessed */ | |
351 | d.dpl = 3; /* Visible to user code */ | |
352 | d.s = 1; /* Not a system segment */ | |
353 | d.p = 1; /* Present */ | |
354 | d.d = 1; /* 32-bit */ | |
d4f829dd AL |
355 | |
356 | write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); | |
357 | } | |
358 | ||
07d36c9e | 359 | static int vgetcpu_online(unsigned int cpu) |
d4f829dd | 360 | { |
07d36c9e | 361 | return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); |
d4f829dd AL |
362 | } |
363 | ||
1c0c1b93 | 364 | static int __init init_vdso(void) |
d4f829dd | 365 | { |
1c0c1b93 AL |
366 | init_vdso_image(&vdso_image_64); |
367 | ||
368 | #ifdef CONFIG_X86_X32_ABI | |
369 | init_vdso_image(&vdso_image_x32); | |
370 | #endif | |
371 | ||
d4f829dd | 372 | /* notifier priority > KVM */ |
07d36c9e | 373 | return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE, |
73c1b41e | 374 | "x86/vdso/vma:online", vgetcpu_online, NULL); |
d4f829dd | 375 | } |
1c0c1b93 AL |
376 | subsys_initcall(init_vdso); |
377 | #endif /* CONFIG_X86_64 */ |