arch/x86/kernel/process_64.c
/*
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 *	Andi Kleen.
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>

asmlinkage extern void ret_from_fork(void);

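/*
 * Per-CPU cache of the user-space stack pointer.  The 64-bit SYSCALL
 * entry path stashes the user %rsp here, and __switch_to() below keeps
 * it in sync with thread.usersp across context switches.
 */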
DEFINE_PER_CPU(unsigned long, old_rsp);

/* Also prints some state that isn't saved in pt_regs. */
void __show_regs(struct pt_regs *regs, int all)
{
        unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
        unsigned long d0, d1, d2, d3, d6, d7;
        unsigned int fsindex, gsindex;
        unsigned int ds, cs, es;

        show_regs_common();
        printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
        printk_address(regs->ip, 1);
        printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
               regs->sp, regs->flags);
        printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
               regs->ax, regs->bx, regs->cx);
        printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
               regs->dx, regs->si, regs->di);
        printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
               regs->bp, regs->r8, regs->r9);
        printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
               regs->r10, regs->r11, regs->r12);
        printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
               regs->r13, regs->r14, regs->r15);

        asm("movl %%ds,%0" : "=r" (ds));
        asm("movl %%cs,%0" : "=r" (cs));
        asm("movl %%es,%0" : "=r" (es));
        asm("movl %%fs,%0" : "=r" (fsindex));
        asm("movl %%gs,%0" : "=r" (gsindex));

        rdmsrl(MSR_FS_BASE, fs);
        rdmsrl(MSR_GS_BASE, gs);
        rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

        if (!all)
                return;

        cr0 = read_cr0();
        cr2 = read_cr2();
        cr3 = read_cr3();
        cr4 = read_cr4();

        printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
               fs, fsindex, gs, gsindex, shadowgs);
        printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
               es, cr0);
        printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
               cr4);

        get_debugreg(d0, 0);
        get_debugreg(d1, 1);
        get_debugreg(d2, 2);
        printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
        get_debugreg(d3, 3);
        get_debugreg(d6, 6);
        get_debugreg(d7, 7);
        printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

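/*
 * Called when the task is being reaped.  By this point a sane process
 * must already have released its LDT, so finding one here is a bug.
 */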
void release_thread(struct task_struct *dead_task)
{
        if (dead_task->mm) {
                if (dead_task->mm->context.size) {
                        pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n",
                                dead_task->comm,
                                dead_task->mm->context.ldt,
                                dead_task->mm->context.size);
                        BUG();
                }
        }
}

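/*
 * Install @addr as the base of a 32-bit TLS segment in slot @tls of the
 * task's GDT tls_array (4GB limit via page granularity).
 */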
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
        struct user_desc ud = {
                .base_addr = addr,
                .limit = 0xfffff,
                .seg_32bit = 1,
                .limit_in_pages = 1,
                .useable = 1,
        };
        struct desc_struct *desc = t->thread.tls_array;
        desc += tls;
        fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
        return get_desc_base(&t->thread.tls_array[tls]);
}

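/*
 * Set up the kernel and user state of a newly forked task.  A NULL
 * @regs means we are creating a kernel thread; @sp then carries the
 * function to call and @arg its argument (consumed by ret_from_fork).
 */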
int copy_thread(unsigned long clone_flags, unsigned long sp,
                unsigned long arg,
                struct task_struct *p, struct pt_regs *regs)
{
        int err;
        struct pt_regs *childregs;
        struct task_struct *me = current;

        p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
        childregs = task_pt_regs(p);
        p->thread.sp = (unsigned long) childregs;
        p->thread.usersp = me->thread.usersp;
        set_tsk_thread_flag(p, TIF_FORK);
        p->fpu_counter = 0;
        p->thread.io_bitmap_ptr = NULL;

        savesegment(gs, p->thread.gsindex);
        p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
        savesegment(fs, p->thread.fsindex);
        p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
        savesegment(es, p->thread.es);
        savesegment(ds, p->thread.ds);
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

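        /*
         * No user pt_regs means this is a kernel thread: build a
         * minimal frame so that ret_from_fork calls the requested
         * function instead of returning to user space.
         */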
        if (unlikely(!regs)) {
                /* kernel thread */
                memset(childregs, 0, sizeof(struct pt_regs));
                childregs->sp = (unsigned long)childregs;
                childregs->ss = __KERNEL_DS;
                childregs->bx = sp; /* function */
                childregs->bp = arg;
                childregs->orig_ax = -1;
                childregs->cs = __KERNEL_CS | get_kernel_rpl();
                childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
                return 0;
        }
        *childregs = *regs;

        childregs->ax = 0;
        childregs->sp = sp;

        err = -ENOMEM;
        memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

        if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
                p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
                                                  IO_BITMAP_BYTES, GFP_KERNEL);
                if (!p->thread.io_bitmap_ptr) {
                        p->thread.io_bitmap_max = 0;
                        return -ENOMEM;
                }
                set_tsk_thread_flag(p, TIF_IO_BITMAP);
        }

        /*
         * Set a new TLS for the child thread?
         */
        if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
                if (test_thread_flag(TIF_IA32))
                        err = do_set_thread_area(p, -1,
                                (struct user_desc __user *)childregs->si, 0);
                else
#endif
                        err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
                if (err)
                        goto out;
        }
        err = 0;
out:
        if (err && p->thread.io_bitmap_ptr) {
                kfree(p->thread.io_bitmap_ptr);
                p->thread.io_bitmap_max = 0;
        }

        return err;
}

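/*
 * Reset the register and segment state for a freshly exec'ed task:
 * flat user segments, cleared FS/GS, and IF set so the new task starts
 * with interrupts enabled.
 */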
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
                    unsigned long new_sp,
                    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
        loadsegment(fs, 0);
        loadsegment(es, _ds);
        loadsegment(ds, _ds);
        load_gs_index(0);
        current->thread.usersp = new_sp;
        regs->ip = new_ip;
        regs->sp = new_sp;
        this_cpu_write(old_rsp, new_sp);
        regs->cs = _cs;
        regs->ss = _ss;
        regs->flags = X86_EFLAGS_IF;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
        start_thread_common(regs, new_ip, new_sp,
                            test_thread_flag(TIF_X32)
                            ? __USER_CS : __USER32_CS,
                            __USER_DS, __USER_DS);
}
#endif

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here.  Set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread;
        struct thread_struct *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        unsigned fsindex, gsindex;
        fpu_switch_t fpu;

        fpu = switch_fpu_prepare(prev_p, next_p, cpu);

        /*
         * Reload esp0, LDT and the page table pointer:
         */
        load_sp0(tss, next);

        /*
         * Switch DS and ES.
         * This won't pick up thread selector changes, but that is fine.
         */
        savesegment(es, prev->es);
        if (unlikely(next->es | prev->es))
                loadsegment(es, next->es);

        savesegment(ds, prev->ds);
        if (unlikely(next->ds | prev->ds))
                loadsegment(ds, next->ds);

        /*
         * We must save %fs and %gs before load_TLS() because
         * %fs and %gs may be cleared by load_TLS()
         * (e.g. by xen_load_tls()).
         */
        savesegment(fs, fsindex);
        savesegment(gs, gsindex);

        load_TLS(next, cpu);

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        /*
         * Switch FS and GS.
         *
         * A non-zero segment register always requires a reload, as does
         * a changed selector.  When the previous process used a 64-bit
         * base, always reload to avoid leaking its base address.
         */
        if (unlikely(fsindex | next->fsindex | prev->fs)) {
                loadsegment(fs, next->fsindex);
                /*
                 * Check whether the user loaded a non-zero selector;
                 * if so, clear the 64-bit base, since a base loaded
                 * this way is always tied to the null selector.
                 */
                if (fsindex)
                        prev->fs = 0;
        }
        /* When the next process has a 64-bit base, use it. */
        if (next->fs)
                wrmsrl(MSR_FS_BASE, next->fs);
        prev->fsindex = fsindex;

        if (unlikely(gsindex | next->gsindex | prev->gs)) {
                load_gs_index(next->gsindex);
                if (gsindex)
                        prev->gs = 0;
        }
        if (next->gs)
                wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
        prev->gsindex = gsindex;

        switch_fpu_finish(next_p, fpu);

        /*
         * Switch the PDA and FPU contexts.
         */
        prev->usersp = this_cpu_read(old_rsp);
        this_cpu_write(old_rsp, next->usersp);
        this_cpu_write(current_task, next_p);

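        /*
         * Point the per-CPU kernel_stack at the next task's stack so
         * that the syscall entry code picks up the right kernel stack
         * on the next entry from user space.
         */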
        this_cpu_write(kernel_stack,
                       (unsigned long)task_stack_page(next_p) +
                       THREAD_SIZE - KERNEL_STACK_OFFSET);

        /*
         * Now maybe reload the debug registers and handle I/O bitmaps
         */
        if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
                     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                __switch_to_xtra(prev_p, next_p, tss);

        return prev_p;
}

void set_personality_64bit(void)
{
        /* inherit personality from parent */

        /* Make sure to be in 64-bit mode */
        clear_thread_flag(TIF_IA32);
        clear_thread_flag(TIF_ADDR32);
        clear_thread_flag(TIF_X32);

        /* Ensure the corresponding mm is not marked. */
        if (current->mm)
                current->mm->context.ia32_compat = 0;

        /*
         * TBD: this overwrites the user's setup and should use two
         * separate bits.  But 64-bit processes have always behaved
         * this way, so it's not too bad.  The main problem is just
         * that 32-bit children are affected again.
         */
        current->personality &= ~READ_IMPLIES_EXEC;
}

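/*
 * Note: an x32 task keeps the 64-bit register convention but runs with
 * 32-bit pointers, so it sets TIF_X32 without the TS_COMPAT status used
 * for plain ia32 tasks.
 */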
void set_personality_ia32(bool x32)
{
        /* inherit personality from parent */

        /* Make sure to be in 32-bit mode */
        set_thread_flag(TIF_ADDR32);

        /* Mark the associated mm as containing 32-bit tasks. */
        if (current->mm)
                current->mm->context.ia32_compat = 1;

        if (x32) {
                clear_thread_flag(TIF_IA32);
                set_thread_flag(TIF_X32);
                current->personality &= ~READ_IMPLIES_EXEC;
                /*
                 * is_compat_task() uses the presence of the x32
                 * syscall bit flag to determine compat status.
                 */
                current_thread_info()->status &= ~TS_COMPAT;
        } else {
                set_thread_flag(TIF_IA32);
                clear_thread_flag(TIF_X32);
                current->personality |= force_personality32;
                /* Prepare the first "return" to user space */
                current_thread_info()->status |= TS_COMPAT;
        }
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

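/*
 * Find where a sleeping task is waiting by walking the saved frame
 * pointers on its kernel stack until we leave the scheduler functions.
 * Only reliable when the kernel is built with frame pointers.
 */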
unsigned long get_wchan(struct task_struct *p)
{
        unsigned long stack;
        u64 fp, ip;
        int count = 0;

        if (!p || p == current || p->state == TASK_RUNNING)
                return 0;
        stack = (unsigned long)task_stack_page(p);
        if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
                return 0;
        fp = *(u64 *)(p->thread.sp);
        do {
                if (fp < (unsigned long)stack ||
                    fp >= (unsigned long)stack+THREAD_SIZE)
                        return 0;
                ip = *(u64 *)(fp+8);
                if (!in_sched_functions(ip))
                        return ip;
                fp = *(u64 *)fp;
        } while (count++ < 16);
        return 0;
}

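/*
 * Get or set the FS/GS base for @task.  Small bases are installed as a
 * 32-bit TLS entry in the GDT (cheaper to switch); large bases go
 * through the FS/GS base MSRs.
 */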
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
        int ret = 0;
        int doit = task == current;
        int cpu;

        switch (code) {
        case ARCH_SET_GS:
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * Handle small bases via the GDT because that's faster
                 * to switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, GS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                load_gs_index(GS_TLS_SEL);
                        }
                        task->thread.gsindex = GS_TLS_SEL;
                        task->thread.gs = 0;
                } else {
                        task->thread.gsindex = 0;
                        task->thread.gs = addr;
                        if (doit) {
                                load_gs_index(0);
                                ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_SET_FS:
                /*
                 * Not strictly needed for %fs, but do it for symmetry
                 * with %gs.
                 */
                if (addr >= TASK_SIZE_OF(task))
                        return -EPERM;
                cpu = get_cpu();
                /*
                 * Handle small bases via the GDT because that's faster
                 * to switch.
                 */
                if (addr <= 0xffffffff) {
                        set_32bit_tls(task, FS_TLS, addr);
                        if (doit) {
                                load_TLS(&task->thread, cpu);
                                loadsegment(fs, FS_TLS_SEL);
                        }
                        task->thread.fsindex = FS_TLS_SEL;
                        task->thread.fs = 0;
                } else {
                        task->thread.fsindex = 0;
                        task->thread.fs = addr;
                        if (doit) {
                                /*
                                 * Set the selector to 0 so as not to
                                 * confuse __switch_to().
                                 */
                                loadsegment(fs, 0);
                                ret = wrmsrl_safe(MSR_FS_BASE, addr);
                        }
                }
                put_cpu();
                break;
        case ARCH_GET_FS: {
                unsigned long base;
                if (task->thread.fsindex == FS_TLS_SEL)
                        base = read_32bit_tls(task, FS_TLS);
                else if (doit)
                        rdmsrl(MSR_FS_BASE, base);
                else
                        base = task->thread.fs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }
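        /*
         * For GS on the current task: if a non-zero selector is loaded,
         * the live base is whatever the descriptor (via the shadow GS
         * base MSR) says, so read it back rather than trusting the
         * cached thread.gs.
         */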
        case ARCH_GET_GS: {
                unsigned long base;
                unsigned gsindex;
                if (task->thread.gsindex == GS_TLS_SEL)
                        base = read_32bit_tls(task, GS_TLS);
                else if (doit) {
                        savesegment(gs, gsindex);
                        if (gsindex)
                                rdmsrl(MSR_KERNEL_GS_BASE, base);
                        else
                                base = task->thread.gs;
                } else
                        base = task->thread.gs;
                ret = put_user(base, (unsigned long __user *)addr);
                break;
        }

        default:
                ret = -EINVAL;
                break;
        }

        return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
        return do_arch_prctl(current, code, addr);
}

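/*
 * Report a task's user stack pointer.  For 64-bit tasks the value
 * lives in thread.usersp, since the 64-bit SYSCALL fast path does not
 * reliably store the user %rsp in pt_regs->sp the way the compat
 * entry path does.
 */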
unsigned long KSTK_ESP(struct task_struct *task)
{
        return (test_tsk_thread_flag(task, TIF_IA32)) ?
                        (task_pt_regs(task)->sp) : ((task)->thread.usersp);
}