]>
git.proxmox.com Git - mirror_ubuntu-focal-kernel.git/blob - arch/x86/kernel/process_64.c
2 * Copyright (C) 1995 Linus Torvalds
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
10 * CPU hotplug support - ashok.raj@intel.com
14 * This file handles the architecture-dependent parts of process handling..
19 #include <linux/stackprotector.h>
20 #include <linux/cpu.h>
21 #include <linux/errno.h>
22 #include <linux/sched.h>
24 #include <linux/kernel.h>
26 #include <linux/elfcore.h>
27 #include <linux/smp.h>
28 #include <linux/slab.h>
29 #include <linux/user.h>
30 #include <linux/interrupt.h>
31 #include <linux/utsname.h>
32 #include <linux/delay.h>
33 #include <linux/module.h>
34 #include <linux/ptrace.h>
35 #include <linux/random.h>
36 #include <linux/notifier.h>
37 #include <linux/kprobes.h>
38 #include <linux/kdebug.h>
39 #include <linux/tick.h>
40 #include <linux/prctl.h>
41 #include <linux/uaccess.h>
43 #include <linux/ftrace.h>
44 #include <linux/dmi.h>
46 #include <asm/pgtable.h>
47 #include <asm/system.h>
48 #include <asm/processor.h>
50 #include <asm/mmu_context.h>
51 #include <asm/prctl.h>
53 #include <asm/proto.h>
56 #include <asm/syscalls.h>
59 asmlinkage
extern void ret_from_fork(void);
61 DEFINE_PER_CPU(struct task_struct
*, current_task
) = &init_task
;
62 EXPORT_PER_CPU_SYMBOL(current_task
);
64 DEFINE_PER_CPU(unsigned long, old_rsp
);
65 static DEFINE_PER_CPU(unsigned char, is_idle
);
67 unsigned long kernel_thread_flags
= CLONE_VM
| CLONE_UNTRACED
;
69 static ATOMIC_NOTIFIER_HEAD(idle_notifier
);
71 void idle_notifier_register(struct notifier_block
*n
)
73 atomic_notifier_chain_register(&idle_notifier
, n
);
75 EXPORT_SYMBOL_GPL(idle_notifier_register
);
77 void idle_notifier_unregister(struct notifier_block
*n
)
79 atomic_notifier_chain_unregister(&idle_notifier
, n
);
81 EXPORT_SYMBOL_GPL(idle_notifier_unregister
);
85 percpu_write(is_idle
, 1);
86 atomic_notifier_call_chain(&idle_notifier
, IDLE_START
, NULL
);
89 static void __exit_idle(void)
91 if (x86_test_and_clear_bit_percpu(0, is_idle
) == 0)
93 atomic_notifier_call_chain(&idle_notifier
, IDLE_END
, NULL
);
96 /* Called from interrupts to signify idle end */
99 /* idle loop has pid 0 */
106 static inline void play_dead(void)
113 * The idle thread. There's no useful work to be
114 * done, so just try to conserve power and have a
115 * low exit latency (ie sit in a loop waiting for
116 * somebody to say that they'd like to reschedule)
120 current_thread_info()->status
|= TS_POLLING
;
123 * If we're the non-boot CPU, nothing set the PDA stack
124 * canary up for us - and if we are the boot CPU we have
125 * a 0 stack canary. This is a good place for updating
126 * it, as we wont ever return from this function (so the
127 * invalid canaries already on the stack wont ever
130 boot_init_stack_canary();
132 /* endless idle loop with no priority at all */
134 tick_nohz_stop_sched_tick(1);
135 while (!need_resched()) {
139 if (cpu_is_offline(smp_processor_id()))
142 * Idle routines should keep interrupts disabled
143 * from here on, until they go to idle.
144 * Otherwise, idle callbacks can misfire.
148 /* Don't trace irqs off for idle */
149 stop_critical_timings();
151 start_critical_timings();
152 /* In many cases the interrupt that ended idle
153 has already called exit_idle. But some idle
154 loops can be woken up without interrupt. */
158 tick_nohz_restart_sched_tick();
159 preempt_enable_no_resched();
165 /* Prints also some state that isn't saved in the pt_regs */
166 void __show_regs(struct pt_regs
*regs
, int all
)
168 unsigned long cr0
= 0L, cr2
= 0L, cr3
= 0L, cr4
= 0L, fs
, gs
, shadowgs
;
169 unsigned long d0
, d1
, d2
, d3
, d6
, d7
;
170 unsigned int fsindex
, gsindex
;
171 unsigned int ds
, cs
, es
;
176 board
= dmi_get_system_info(DMI_PRODUCT_NAME
);
179 printk(KERN_INFO
"Pid: %d, comm: %.20s %s %s %.*s %s\n",
180 current
->pid
, current
->comm
, print_tainted(),
181 init_utsname()->release
,
182 (int)strcspn(init_utsname()->version
, " "),
183 init_utsname()->version
, board
);
184 printk(KERN_INFO
"RIP: %04lx:[<%016lx>] ", regs
->cs
& 0xffff, regs
->ip
);
185 printk_address(regs
->ip
, 1);
186 printk(KERN_INFO
"RSP: %04lx:%016lx EFLAGS: %08lx\n", regs
->ss
,
187 regs
->sp
, regs
->flags
);
188 printk(KERN_INFO
"RAX: %016lx RBX: %016lx RCX: %016lx\n",
189 regs
->ax
, regs
->bx
, regs
->cx
);
190 printk(KERN_INFO
"RDX: %016lx RSI: %016lx RDI: %016lx\n",
191 regs
->dx
, regs
->si
, regs
->di
);
192 printk(KERN_INFO
"RBP: %016lx R08: %016lx R09: %016lx\n",
193 regs
->bp
, regs
->r8
, regs
->r9
);
194 printk(KERN_INFO
"R10: %016lx R11: %016lx R12: %016lx\n",
195 regs
->r10
, regs
->r11
, regs
->r12
);
196 printk(KERN_INFO
"R13: %016lx R14: %016lx R15: %016lx\n",
197 regs
->r13
, regs
->r14
, regs
->r15
);
199 asm("movl %%ds,%0" : "=r" (ds
));
200 asm("movl %%cs,%0" : "=r" (cs
));
201 asm("movl %%es,%0" : "=r" (es
));
202 asm("movl %%fs,%0" : "=r" (fsindex
));
203 asm("movl %%gs,%0" : "=r" (gsindex
));
205 rdmsrl(MSR_FS_BASE
, fs
);
206 rdmsrl(MSR_GS_BASE
, gs
);
207 rdmsrl(MSR_KERNEL_GS_BASE
, shadowgs
);
217 printk(KERN_INFO
"FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
218 fs
, fsindex
, gs
, gsindex
, shadowgs
);
219 printk(KERN_INFO
"CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs
, ds
,
221 printk(KERN_INFO
"CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2
, cr3
,
227 printk(KERN_INFO
"DR0: %016lx DR1: %016lx DR2: %016lx\n", d0
, d1
, d2
);
231 printk(KERN_INFO
"DR3: %016lx DR6: %016lx DR7: %016lx\n", d3
, d6
, d7
);
234 void show_regs(struct pt_regs
*regs
)
236 printk(KERN_INFO
"CPU %d:", smp_processor_id());
237 __show_regs(regs
, 1);
238 show_trace(NULL
, regs
, (void *)(regs
+ 1), regs
->bp
);
242 * Free current thread data structures etc..
244 void exit_thread(void)
246 struct task_struct
*me
= current
;
247 struct thread_struct
*t
= &me
->thread
;
249 if (me
->thread
.io_bitmap_ptr
) {
250 struct tss_struct
*tss
= &per_cpu(init_tss
, get_cpu());
252 kfree(t
->io_bitmap_ptr
);
253 t
->io_bitmap_ptr
= NULL
;
254 clear_thread_flag(TIF_IO_BITMAP
);
256 * Careful, clear this in the TSS too:
258 memset(tss
->io_bitmap
, 0xff, t
->io_bitmap_max
);
259 t
->io_bitmap_max
= 0;
263 ds_exit_thread(current
);
266 void flush_thread(void)
268 struct task_struct
*tsk
= current
;
270 if (test_tsk_thread_flag(tsk
, TIF_ABI_PENDING
)) {
271 clear_tsk_thread_flag(tsk
, TIF_ABI_PENDING
);
272 if (test_tsk_thread_flag(tsk
, TIF_IA32
)) {
273 clear_tsk_thread_flag(tsk
, TIF_IA32
);
275 set_tsk_thread_flag(tsk
, TIF_IA32
);
276 current_thread_info()->status
|= TS_COMPAT
;
279 clear_tsk_thread_flag(tsk
, TIF_DEBUG
);
281 tsk
->thread
.debugreg0
= 0;
282 tsk
->thread
.debugreg1
= 0;
283 tsk
->thread
.debugreg2
= 0;
284 tsk
->thread
.debugreg3
= 0;
285 tsk
->thread
.debugreg6
= 0;
286 tsk
->thread
.debugreg7
= 0;
287 memset(tsk
->thread
.tls_array
, 0, sizeof(tsk
->thread
.tls_array
));
289 * Forget coprocessor state..
291 tsk
->fpu_counter
= 0;
296 void release_thread(struct task_struct
*dead_task
)
299 if (dead_task
->mm
->context
.size
) {
300 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
302 dead_task
->mm
->context
.ldt
,
303 dead_task
->mm
->context
.size
);
309 static inline void set_32bit_tls(struct task_struct
*t
, int tls
, u32 addr
)
311 struct user_desc ud
= {
318 struct desc_struct
*desc
= t
->thread
.tls_array
;
323 static inline u32
read_32bit_tls(struct task_struct
*t
, int tls
)
325 return get_desc_base(&t
->thread
.tls_array
[tls
]);
329 * This gets called before we allocate a new thread and copy
330 * the current task into it.
332 void prepare_to_copy(struct task_struct
*tsk
)
337 int copy_thread(int nr
, unsigned long clone_flags
, unsigned long sp
,
338 unsigned long unused
,
339 struct task_struct
*p
, struct pt_regs
*regs
)
342 struct pt_regs
*childregs
;
343 struct task_struct
*me
= current
;
345 childregs
= ((struct pt_regs
*)
346 (THREAD_SIZE
+ task_stack_page(p
))) - 1;
352 childregs
->sp
= (unsigned long)childregs
;
354 p
->thread
.sp
= (unsigned long) childregs
;
355 p
->thread
.sp0
= (unsigned long) (childregs
+1);
356 p
->thread
.usersp
= me
->thread
.usersp
;
358 set_tsk_thread_flag(p
, TIF_FORK
);
360 p
->thread
.fs
= me
->thread
.fs
;
361 p
->thread
.gs
= me
->thread
.gs
;
363 savesegment(gs
, p
->thread
.gsindex
);
364 savesegment(fs
, p
->thread
.fsindex
);
365 savesegment(es
, p
->thread
.es
);
366 savesegment(ds
, p
->thread
.ds
);
368 if (unlikely(test_tsk_thread_flag(me
, TIF_IO_BITMAP
))) {
369 p
->thread
.io_bitmap_ptr
= kmalloc(IO_BITMAP_BYTES
, GFP_KERNEL
);
370 if (!p
->thread
.io_bitmap_ptr
) {
371 p
->thread
.io_bitmap_max
= 0;
374 memcpy(p
->thread
.io_bitmap_ptr
, me
->thread
.io_bitmap_ptr
,
376 set_tsk_thread_flag(p
, TIF_IO_BITMAP
);
380 * Set a new TLS for the child thread?
382 if (clone_flags
& CLONE_SETTLS
) {
383 #ifdef CONFIG_IA32_EMULATION
384 if (test_thread_flag(TIF_IA32
))
385 err
= do_set_thread_area(p
, -1,
386 (struct user_desc __user
*)childregs
->si
, 0);
389 err
= do_arch_prctl(p
, ARCH_SET_FS
, childregs
->r8
);
394 ds_copy_thread(p
, me
);
396 clear_tsk_thread_flag(p
, TIF_DEBUGCTLMSR
);
397 p
->thread
.debugctlmsr
= 0;
401 if (err
&& p
->thread
.io_bitmap_ptr
) {
402 kfree(p
->thread
.io_bitmap_ptr
);
403 p
->thread
.io_bitmap_max
= 0;
409 start_thread(struct pt_regs
*regs
, unsigned long new_ip
, unsigned long new_sp
)
417 percpu_write(old_rsp
, new_sp
);
418 regs
->cs
= __USER_CS
;
419 regs
->ss
= __USER_DS
;
423 * Free the old FP and other extended state
425 free_thread_xstate(current
);
427 EXPORT_SYMBOL_GPL(start_thread
);
429 static void hard_disable_TSC(void)
431 write_cr4(read_cr4() | X86_CR4_TSD
);
434 void disable_TSC(void)
437 if (!test_and_set_thread_flag(TIF_NOTSC
))
439 * Must flip the CPU state synchronously with
440 * TIF_NOTSC in the current running context.
446 static void hard_enable_TSC(void)
448 write_cr4(read_cr4() & ~X86_CR4_TSD
);
451 static void enable_TSC(void)
454 if (test_and_clear_thread_flag(TIF_NOTSC
))
456 * Must flip the CPU state synchronously with
457 * TIF_NOTSC in the current running context.
463 int get_tsc_mode(unsigned long adr
)
467 if (test_thread_flag(TIF_NOTSC
))
468 val
= PR_TSC_SIGSEGV
;
472 return put_user(val
, (unsigned int __user
*)adr
);
475 int set_tsc_mode(unsigned int val
)
477 if (val
== PR_TSC_SIGSEGV
)
479 else if (val
== PR_TSC_ENABLE
)
488 * This special macro can be used to load a debugging register
490 #define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
492 static inline void __switch_to_xtra(struct task_struct
*prev_p
,
493 struct task_struct
*next_p
,
494 struct tss_struct
*tss
)
496 struct thread_struct
*prev
, *next
;
498 prev
= &prev_p
->thread
,
499 next
= &next_p
->thread
;
501 if (test_tsk_thread_flag(next_p
, TIF_DS_AREA_MSR
) ||
502 test_tsk_thread_flag(prev_p
, TIF_DS_AREA_MSR
))
503 ds_switch_to(prev_p
, next_p
);
504 else if (next
->debugctlmsr
!= prev
->debugctlmsr
)
505 update_debugctlmsr(next
->debugctlmsr
);
507 if (test_tsk_thread_flag(next_p
, TIF_DEBUG
)) {
517 if (test_tsk_thread_flag(prev_p
, TIF_NOTSC
) ^
518 test_tsk_thread_flag(next_p
, TIF_NOTSC
)) {
519 /* prev and next are different */
520 if (test_tsk_thread_flag(next_p
, TIF_NOTSC
))
526 if (test_tsk_thread_flag(next_p
, TIF_IO_BITMAP
)) {
528 * Copy the relevant range of the IO bitmap.
529 * Normally this is 128 bytes or less:
531 memcpy(tss
->io_bitmap
, next
->io_bitmap_ptr
,
532 max(prev
->io_bitmap_max
, next
->io_bitmap_max
));
533 } else if (test_tsk_thread_flag(prev_p
, TIF_IO_BITMAP
)) {
535 * Clear any possible leftover bits:
537 memset(tss
->io_bitmap
, 0xff, prev
->io_bitmap_max
);
542 * switch_to(x,y) should switch tasks from x to y.
544 * This could still be optimized:
545 * - fold all the options into a flag word and test it with a single test.
546 * - could test fs/gs bitsliced
548 * Kprobes not supported here. Set the probe on schedule instead.
549 * Function graph tracer not supported too.
551 __notrace_funcgraph
struct task_struct
*
552 __switch_to(struct task_struct
*prev_p
, struct task_struct
*next_p
)
554 struct thread_struct
*prev
= &prev_p
->thread
;
555 struct thread_struct
*next
= &next_p
->thread
;
556 int cpu
= smp_processor_id();
557 struct tss_struct
*tss
= &per_cpu(init_tss
, cpu
);
558 unsigned fsindex
, gsindex
;
560 /* we're going to use this soon, after a few expensive things */
561 if (next_p
->fpu_counter
> 5)
562 prefetch(next
->xstate
);
565 * Reload esp0, LDT and the page table pointer:
571 * This won't pick up thread selector changes, but I guess that is ok.
573 savesegment(es
, prev
->es
);
574 if (unlikely(next
->es
| prev
->es
))
575 loadsegment(es
, next
->es
);
577 savesegment(ds
, prev
->ds
);
578 if (unlikely(next
->ds
| prev
->ds
))
579 loadsegment(ds
, next
->ds
);
582 /* We must save %fs and %gs before load_TLS() because
583 * %fs and %gs may be cleared by load_TLS().
585 * (e.g. xen_load_tls())
587 savesegment(fs
, fsindex
);
588 savesegment(gs
, gsindex
);
593 * Leave lazy mode, flushing any hypercalls made here.
594 * This must be done before restoring TLS segments so
595 * the GDT and LDT are properly updated, and must be
596 * done before math_state_restore, so the TS bit is up
599 arch_leave_lazy_cpu_mode();
604 * Segment register != 0 always requires a reload. Also
605 * reload when it has changed. When prev process used 64bit
606 * base always reload to avoid an information leak.
608 if (unlikely(fsindex
| next
->fsindex
| prev
->fs
)) {
609 loadsegment(fs
, next
->fsindex
);
611 * Check if the user used a selector != 0; if yes
612 * clear 64bit base, since overloaded base is always
613 * mapped to the Null selector
618 /* when next process has a 64bit base use it */
620 wrmsrl(MSR_FS_BASE
, next
->fs
);
621 prev
->fsindex
= fsindex
;
623 if (unlikely(gsindex
| next
->gsindex
| prev
->gs
)) {
624 load_gs_index(next
->gsindex
);
629 wrmsrl(MSR_KERNEL_GS_BASE
, next
->gs
);
630 prev
->gsindex
= gsindex
;
632 /* Must be after DS reload */
636 * Switch the PDA and FPU contexts.
638 prev
->usersp
= percpu_read(old_rsp
);
639 percpu_write(old_rsp
, next
->usersp
);
640 percpu_write(current_task
, next_p
);
642 percpu_write(kernel_stack
,
643 (unsigned long)task_stack_page(next_p
) +
644 THREAD_SIZE
- KERNEL_STACK_OFFSET
);
647 * Now maybe reload the debug registers and handle I/O bitmaps
649 if (unlikely(task_thread_info(next_p
)->flags
& _TIF_WORK_CTXSW_NEXT
||
650 task_thread_info(prev_p
)->flags
& _TIF_WORK_CTXSW_PREV
))
651 __switch_to_xtra(prev_p
, next_p
, tss
);
653 /* If the task has used fpu the last 5 timeslices, just do a full
654 * restore of the math state immediately to avoid the trap; the
655 * chances of needing FPU soon are obviously high now
657 * tsk_used_math() checks prevent calling math_state_restore(),
658 * which can sleep in the case of !tsk_used_math()
660 if (tsk_used_math(next_p
) && next_p
->fpu_counter
> 5)
661 math_state_restore();
666 * sys_execve() executes a new program.
669 long sys_execve(char __user
*name
, char __user
* __user
*argv
,
670 char __user
* __user
*envp
, struct pt_regs
*regs
)
675 filename
= getname(name
);
676 error
= PTR_ERR(filename
);
677 if (IS_ERR(filename
))
679 error
= do_execve(filename
, argv
, envp
, regs
);
684 void set_personality_64bit(void)
686 /* inherit personality from parent */
688 /* Make sure to be in 64bit mode */
689 clear_thread_flag(TIF_IA32
);
691 /* TBD: overwrites user setup. Should have two bits.
692 But 64bit processes have always behaved this way,
693 so it's not too bad. The main problem is just that
694 32bit childs are affected again. */
695 current
->personality
&= ~READ_IMPLIES_EXEC
;
698 asmlinkage
long sys_fork(struct pt_regs
*regs
)
700 return do_fork(SIGCHLD
, regs
->sp
, regs
, 0, NULL
, NULL
);
704 sys_clone(unsigned long clone_flags
, unsigned long newsp
,
705 void __user
*parent_tid
, void __user
*child_tid
, struct pt_regs
*regs
)
709 return do_fork(clone_flags
, newsp
, regs
, 0, parent_tid
, child_tid
);
713 * This is trivial, and on the face of it looks like it
714 * could equally well be done in user mode.
716 * Not so, for quite unobvious reasons - register pressure.
717 * In user mode vfork() cannot have a stack frame, and if
718 * done by calling the "clone()" system call directly, you
719 * do not have enough call-clobbered registers to hold all
720 * the information you need.
722 asmlinkage
long sys_vfork(struct pt_regs
*regs
)
724 return do_fork(CLONE_VFORK
| CLONE_VM
| SIGCHLD
, regs
->sp
, regs
, 0,
728 unsigned long get_wchan(struct task_struct
*p
)
734 if (!p
|| p
== current
|| p
->state
== TASK_RUNNING
)
736 stack
= (unsigned long)task_stack_page(p
);
737 if (p
->thread
.sp
< stack
|| p
->thread
.sp
>= stack
+THREAD_SIZE
)
739 fp
= *(u64
*)(p
->thread
.sp
);
741 if (fp
< (unsigned long)stack
||
742 fp
>= (unsigned long)stack
+THREAD_SIZE
)
745 if (!in_sched_functions(ip
))
748 } while (count
++ < 16);
752 long do_arch_prctl(struct task_struct
*task
, int code
, unsigned long addr
)
755 int doit
= task
== current
;
760 if (addr
>= TASK_SIZE_OF(task
))
763 /* handle small bases via the GDT because that's faster to
765 if (addr
<= 0xffffffff) {
766 set_32bit_tls(task
, GS_TLS
, addr
);
768 load_TLS(&task
->thread
, cpu
);
769 load_gs_index(GS_TLS_SEL
);
771 task
->thread
.gsindex
= GS_TLS_SEL
;
774 task
->thread
.gsindex
= 0;
775 task
->thread
.gs
= addr
;
778 ret
= checking_wrmsrl(MSR_KERNEL_GS_BASE
, addr
);
784 /* Not strictly needed for fs, but do it for symmetry
786 if (addr
>= TASK_SIZE_OF(task
))
789 /* handle small bases via the GDT because that's faster to
791 if (addr
<= 0xffffffff) {
792 set_32bit_tls(task
, FS_TLS
, addr
);
794 load_TLS(&task
->thread
, cpu
);
795 loadsegment(fs
, FS_TLS_SEL
);
797 task
->thread
.fsindex
= FS_TLS_SEL
;
800 task
->thread
.fsindex
= 0;
801 task
->thread
.fs
= addr
;
803 /* set the selector to 0 to not confuse
806 ret
= checking_wrmsrl(MSR_FS_BASE
, addr
);
813 if (task
->thread
.fsindex
== FS_TLS_SEL
)
814 base
= read_32bit_tls(task
, FS_TLS
);
816 rdmsrl(MSR_FS_BASE
, base
);
818 base
= task
->thread
.fs
;
819 ret
= put_user(base
, (unsigned long __user
*)addr
);
825 if (task
->thread
.gsindex
== GS_TLS_SEL
)
826 base
= read_32bit_tls(task
, GS_TLS
);
828 savesegment(gs
, gsindex
);
830 rdmsrl(MSR_KERNEL_GS_BASE
, base
);
832 base
= task
->thread
.gs
;
834 base
= task
->thread
.gs
;
835 ret
= put_user(base
, (unsigned long __user
*)addr
);
847 long sys_arch_prctl(int code
, unsigned long addr
)
849 return do_arch_prctl(current
, code
, addr
);
852 unsigned long arch_align_stack(unsigned long sp
)
854 if (!(current
->personality
& ADDR_NO_RANDOMIZE
) && randomize_va_space
)
855 sp
-= get_random_int() % 8192;
859 unsigned long arch_randomize_brk(struct mm_struct
*mm
)
861 unsigned long range_end
= mm
->brk
+ 0x02000000;
862 return randomize_range(mm
->brk
, range_end
, 0) ? : mm
->brk
;