arch/x86/kernel/process_64.c (mirror_ubuntu-artful-kernel.git, blob at commit "x86: improve default idle")
1 /*
2 * Copyright (C) 1995 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
6 *
7 * X86-64 port
8 * Andi Kleen.
9 *
10 * CPU hotplug support - ashok.raj@intel.com
11 */
12
13 /*
14 * This file handles the architecture-dependent parts of process handling.
15 */
16
17 #include <stdarg.h>
18
19 #include <linux/cpu.h>
20 #include <linux/errno.h>
21 #include <linux/sched.h>
22 #include <linux/fs.h>
23 #include <linux/kernel.h>
24 #include <linux/mm.h>
25 #include <linux/elfcore.h>
26 #include <linux/smp.h>
27 #include <linux/slab.h>
28 #include <linux/user.h>
29 #include <linux/interrupt.h>
30 #include <linux/utsname.h>
31 #include <linux/delay.h>
32 #include <linux/module.h>
33 #include <linux/ptrace.h>
34 #include <linux/random.h>
35 #include <linux/notifier.h>
36 #include <linux/kprobes.h>
37 #include <linux/kdebug.h>
38 #include <linux/tick.h>
39
40 #include <asm/uaccess.h>
41 #include <asm/pgtable.h>
42 #include <asm/system.h>
43 #include <asm/io.h>
44 #include <asm/processor.h>
45 #include <asm/i387.h>
46 #include <asm/mmu_context.h>
47 #include <asm/pda.h>
48 #include <asm/prctl.h>
49 #include <asm/desc.h>
50 #include <asm/proto.h>
51 #include <asm/ia32.h>
52 #include <asm/idle.h>
53
54 asmlinkage extern void ret_from_fork(void);
55
56 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
57
58 unsigned long boot_option_idle_override = 0;
59 EXPORT_SYMBOL(boot_option_idle_override);
60
61 /*
62 * Power management idle function, if any.
63 */
64 void (*pm_idle)(void);
65 EXPORT_SYMBOL(pm_idle);
66
67 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
68
69 void idle_notifier_register(struct notifier_block *n)
70 {
71 atomic_notifier_chain_register(&idle_notifier, n);
72 }
73
74 void enter_idle(void)
75 {
76 write_pda(isidle, 1);
77 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
78 }
79
80 static void __exit_idle(void)
81 {
82 if (test_and_clear_bit_pda(0, isidle) == 0)
83 return;
84 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
85 }
86
87 /* Called from interrupts to signify idle end */
88 void exit_idle(void)
89 {
90 /* idle loop has pid 0 */
91 if (current->pid)
92 return;
93 __exit_idle();
94 }
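/*
 * A minimal usage sketch of the idle notifier API above. The callback and the
 * names my_idle_notify/my_idle_nb are hypothetical; idle_notifier_register(),
 * IDLE_START and IDLE_END are the real interface:
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *data)
 *	{
 *		if (action == IDLE_START)
 *			;	/* this CPU is entering idle */
 *		else if (action == IDLE_END)
 *			;	/* this CPU left idle (e.g. due to an interrupt) */
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 */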
95
96 /*
97 * We use this if we don't have any better
98 * idle routine..
99 */
100 void default_idle(void)
101 {
102 current_thread_info()->status &= ~TS_POLLING;
103 /*
104 * TS_POLLING-cleared state must be visible before we
105 * test NEED_RESCHED:
106 */
107 smp_mb();
108 local_irq_disable();
109 if (!need_resched()) {
110 safe_halt(); /* enables interrupts racelessly */
111 local_irq_disable();
112 }
113 local_irq_enable();
114 current_thread_info()->status |= TS_POLLING;
115 }
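/*
 * The TS_POLLING handling above pairs with the wakeup side in the scheduler:
 * a remote CPU that sets TIF_NEED_RESCHED only sends a reschedule IPI when the
 * target is *not* advertising TS_POLLING, since a polling (or mwaiting) idle
 * loop notices the flag change by itself. Roughly, the generic resched_task()
 * logic (a sketch, not code from this file) is:
 *
 *	set_tsk_need_resched(p);
 *	smp_mb();
 *	if (!tsk_is_polling(p))
 *		smp_send_reschedule(cpu);
 *
 * That is why default_idle() must clear TS_POLLING and re-check need_resched()
 * with a full barrier in between before it executes HLT.
 */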
116
117 /*
118 * On SMP it's slightly faster (but much more power-consuming!)
119 * to poll the ->need_resched flag instead of waiting for the
120 * cross-CPU IPI to arrive. Use this option with caution.
121 */
122 static void poll_idle(void)
123 {
124 local_irq_enable();
125 cpu_relax();
126 }
127
128 #ifdef CONFIG_HOTPLUG_CPU
129 DECLARE_PER_CPU(int, cpu_state);
130
131 #include <asm/nmi.h>
132 /* We halt the CPU with physical CPU hotplug */
133 static inline void play_dead(void)
134 {
135 idle_task_exit();
136 wbinvd();
137 mb();
138 /* Ack it */
139 __get_cpu_var(cpu_state) = CPU_DEAD;
140
141 local_irq_disable();
142 while (1)
143 halt();
144 }
145 #else
146 static inline void play_dead(void)
147 {
148 BUG();
149 }
150 #endif /* CONFIG_HOTPLUG_CPU */
151
152 /*
153 * The idle thread. There's no useful work to be
154 * done, so just try to conserve power and have a
155 * low exit latency (i.e. sit in a loop waiting for
156 * somebody to say that they'd like to reschedule)
157 */
158 void cpu_idle(void)
159 {
160 current_thread_info()->status |= TS_POLLING;
161 /* endless idle loop with no priority at all */
162 while (1) {
163 tick_nohz_stop_sched_tick();
164 while (!need_resched()) {
165 void (*idle)(void);
166
167 rmb();
168 idle = pm_idle;
169 if (!idle)
170 idle = default_idle;
171 if (cpu_is_offline(smp_processor_id()))
172 play_dead();
173 /*
174 * Idle routines should keep interrupts disabled
175 * from here on, until they go to idle.
176 * Otherwise, idle callbacks can misfire.
177 */
178 local_irq_disable();
179 enter_idle();
180 idle();
181 /* In many cases the interrupt that ended idle
182 has already called exit_idle. But some idle
183 loops can be woken up without interrupt. */
184 __exit_idle();
185 }
186
187 tick_nohz_restart_sched_tick();
188 preempt_enable_no_resched();
189 schedule();
190 preempt_disable();
191 }
192 }
193
194 static void do_nothing(void *unused)
195 {
196 }
197
198 /*
199 * cpu_idle_wait - Used to ensure that all the CPUs discard the old value of
200 * pm_idle and switch to the new one. Required when changing the pm_idle
201 * handler on SMP systems.
202 *
203 * The caller must have changed pm_idle to the new value before the call. The
204 * old pm_idle value will not be used by any CPU after this function returns.
205 */
206 void cpu_idle_wait(void)
207 {
208 smp_mb();
209 /* kick all the CPUs so that they exit out of pm_idle */
210 smp_call_function(do_nothing, NULL, 0, 1);
211 }
212 EXPORT_SYMBOL_GPL(cpu_idle_wait);
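/*
 * Typical (hypothetical) usage by a driver installing its own idle handler;
 * my_deep_idle is a made-up name, the protocol is the one documented above:
 *
 *	pm_idle = my_deep_idle;		/* publish the new handler first */
 *	cpu_idle_wait();		/* afterwards no CPU still runs the old one */
 */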
213
214 /*
215 * This uses the MONITOR/MWAIT instructions on P4 processors with PNI,
216 * which can obviate the IPI otherwise needed to trigger a check of need_resched.
217 * We execute MONITOR against need_resched and enter an optimized wait state
218 * through MWAIT. Whenever someone changes need_resched, we are woken
219 * up from MWAIT (without an IPI).
220 *
221 * Starting with Core Duo processors, MWAIT can take hints based on CPU
222 * capability.
223 */
224 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
225 {
226 if (!need_resched()) {
227 __monitor((void *)&current_thread_info()->flags, 0, 0);
228 smp_mb();
229 if (!need_resched())
230 __mwait(ax, cx);
231 }
232 }
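/*
 * The two arguments map directly onto the MWAIT instruction: "ax" is loaded
 * into EAX and selects the target C-state/sub-state hint, "cx" goes into ECX
 * and selects extensions (bit 0 = treat masked interrupts as break events).
 * A hedged example requesting the C1 hint with the interrupt-break-event
 * extension would be:
 *
 *	mwait_idle_with_hints(0x00, 0x01);
 */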
233
234 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
235 static void mwait_idle(void)
236 {
237 if (!need_resched()) {
238 __monitor((void *)&current_thread_info()->flags, 0, 0);
239 smp_mb();
240 if (!need_resched())
241 __sti_mwait(0, 0);
242 else
243 local_irq_enable();
244 } else {
245 local_irq_enable();
246 }
247 }
248
249
250 static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
251 {
252 if (force_mwait)
253 return 1;
254 /* Any C1 states supported? */
255 return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
256 }
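/*
 * mwait_usable() reads CPUID leaf 5 (the MONITOR/MWAIT leaf): EDX reports, in
 * 4-bit fields, how many MWAIT sub-states each C-state supports; bits 7:4 are
 * the C1 sub-states, which is what the ">> 4) & 0xf" above extracts.
 */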
257
258 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
259 {
260 static int selected;
261
262 if (selected)
263 return;
264 #ifdef CONFIG_X86_SMP
265 if (pm_idle == poll_idle && smp_num_siblings > 1) {
266 printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
267 " performance may degrade.\n");
268 }
269 #endif
270 if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
271 /*
272 * Skip if setup has overridden idle.
273 * One CPU supports mwait => all CPUs support mwait
274 */
275 if (!pm_idle) {
276 printk(KERN_INFO "using mwait in idle threads.\n");
277 pm_idle = mwait_idle;
278 }
279 }
280 selected = 1;
281 }
282
283 static int __init idle_setup(char *str)
284 {
285 if (!strcmp(str, "poll")) {
286 printk("using polling idle threads.\n");
287 pm_idle = poll_idle;
288 } else if (!strcmp(str, "mwait"))
289 force_mwait = 1;
290 else
291 return -1;
292
293 boot_option_idle_override = 1;
294 return 0;
295 }
296 early_param("idle", idle_setup);
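/*
 * idle_setup() implements the "idle=" kernel command line option: "idle=poll"
 * switches pm_idle to poll_idle(), and "idle=mwait" sets force_mwait so that
 * select_idle_routine() picks mwait_idle() whenever the CPU advertises MWAIT.
 * Both spellings also set boot_option_idle_override.
 */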
297
298 /* Also prints some state that isn't saved in the pt_regs */
299 void __show_regs(struct pt_regs * regs)
300 {
301 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
302 unsigned long d0, d1, d2, d3, d6, d7;
303 unsigned int fsindex, gsindex;
304 unsigned int ds, cs, es;
305
306 printk("\n");
307 print_modules();
308 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
309 current->pid, current->comm, print_tainted(),
310 init_utsname()->release,
311 (int)strcspn(init_utsname()->version, " "),
312 init_utsname()->version);
313 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
314 printk_address(regs->ip, 1);
315 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp,
316 regs->flags);
317 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
318 regs->ax, regs->bx, regs->cx);
319 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
320 regs->dx, regs->si, regs->di);
321 printk("RBP: %016lx R08: %016lx R09: %016lx\n",
322 regs->bp, regs->r8, regs->r9);
323 printk("R10: %016lx R11: %016lx R12: %016lx\n",
324 regs->r10, regs->r11, regs->r12);
325 printk("R13: %016lx R14: %016lx R15: %016lx\n",
326 regs->r13, regs->r14, regs->r15);
327
328 asm("movl %%ds,%0" : "=r" (ds));
329 asm("movl %%cs,%0" : "=r" (cs));
330 asm("movl %%es,%0" : "=r" (es));
331 asm("movl %%fs,%0" : "=r" (fsindex));
332 asm("movl %%gs,%0" : "=r" (gsindex));
333
334 rdmsrl(MSR_FS_BASE, fs);
335 rdmsrl(MSR_GS_BASE, gs);
336 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
337
338 cr0 = read_cr0();
339 cr2 = read_cr2();
340 cr3 = read_cr3();
341 cr4 = read_cr4();
342
343 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
344 fs,fsindex,gs,gsindex,shadowgs);
345 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
346 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
347
348 get_debugreg(d0, 0);
349 get_debugreg(d1, 1);
350 get_debugreg(d2, 2);
351 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
352 get_debugreg(d3, 3);
353 get_debugreg(d6, 6);
354 get_debugreg(d7, 7);
355 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
356 }
357
358 void show_regs(struct pt_regs *regs)
359 {
360 printk("CPU %d:", smp_processor_id());
361 __show_regs(regs);
362 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
363 }
364
365 /*
366 * Free current thread data structures etc..
367 */
368 void exit_thread(void)
369 {
370 struct task_struct *me = current;
371 struct thread_struct *t = &me->thread;
372
373 if (me->thread.io_bitmap_ptr) {
374 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
375
376 kfree(t->io_bitmap_ptr);
377 t->io_bitmap_ptr = NULL;
378 clear_thread_flag(TIF_IO_BITMAP);
379 /*
380 * Careful, clear this in the TSS too:
381 */
382 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
383 t->io_bitmap_max = 0;
384 put_cpu();
385 }
386 }
387
388 void flush_thread(void)
389 {
390 struct task_struct *tsk = current;
391
392 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
393 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
394 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
395 clear_tsk_thread_flag(tsk, TIF_IA32);
396 } else {
397 set_tsk_thread_flag(tsk, TIF_IA32);
398 current_thread_info()->status |= TS_COMPAT;
399 }
400 }
401 clear_tsk_thread_flag(tsk, TIF_DEBUG);
402
403 tsk->thread.debugreg0 = 0;
404 tsk->thread.debugreg1 = 0;
405 tsk->thread.debugreg2 = 0;
406 tsk->thread.debugreg3 = 0;
407 tsk->thread.debugreg6 = 0;
408 tsk->thread.debugreg7 = 0;
409 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
410 /*
411 * Forget coprocessor state..
412 */
413 clear_fpu(tsk);
414 clear_used_math();
415 }
416
417 void release_thread(struct task_struct *dead_task)
418 {
419 if (dead_task->mm) {
420 if (dead_task->mm->context.size) {
421 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
422 dead_task->comm,
423 dead_task->mm->context.ldt,
424 dead_task->mm->context.size);
425 BUG();
426 }
427 }
428 }
429
430 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
431 {
432 struct user_desc ud = {
433 .base_addr = addr,
434 .limit = 0xfffff,
435 .seg_32bit = 1,
436 .limit_in_pages = 1,
437 .useable = 1,
438 };
439 struct desc_struct *desc = t->thread.tls_array;
440 desc += tls;
441 fill_ldt(desc, &ud);
442 }
443
444 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
445 {
446 return get_desc_base(&t->thread.tls_array[tls]);
447 }
448
449 /*
450 * This gets called before we allocate a new thread and copy
451 * the current task into it.
452 */
453 void prepare_to_copy(struct task_struct *tsk)
454 {
455 unlazy_fpu(tsk);
456 }
457
458 int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
459 unsigned long unused,
460 struct task_struct * p, struct pt_regs * regs)
461 {
462 int err;
463 struct pt_regs * childregs;
464 struct task_struct *me = current;
465
466 childregs = ((struct pt_regs *)
467 (THREAD_SIZE + task_stack_page(p))) - 1;
468 *childregs = *regs;
469
470 childregs->ax = 0;
471 childregs->sp = sp;
472 if (sp == ~0UL)
473 childregs->sp = (unsigned long)childregs;
474
475 p->thread.sp = (unsigned long) childregs;
476 p->thread.sp0 = (unsigned long) (childregs+1);
477 p->thread.usersp = me->thread.usersp;
478
479 set_tsk_thread_flag(p, TIF_FORK);
480
481 p->thread.fs = me->thread.fs;
482 p->thread.gs = me->thread.gs;
483
484 asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
485 asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
486 asm("mov %%es,%0" : "=m" (p->thread.es));
487 asm("mov %%ds,%0" : "=m" (p->thread.ds));
488
489 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
490 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
491 if (!p->thread.io_bitmap_ptr) {
492 p->thread.io_bitmap_max = 0;
493 return -ENOMEM;
494 }
495 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
496 IO_BITMAP_BYTES);
497 set_tsk_thread_flag(p, TIF_IO_BITMAP);
498 }
499
500 /*
501 * Set a new TLS for the child thread?
502 */
503 if (clone_flags & CLONE_SETTLS) {
504 #ifdef CONFIG_IA32_EMULATION
505 if (test_thread_flag(TIF_IA32))
506 err = do_set_thread_area(p, -1,
507 (struct user_desc __user *)childregs->si, 0);
508 else
509 #endif
510 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
511 if (err)
512 goto out;
513 }
514 err = 0;
515 out:
516 if (err && p->thread.io_bitmap_ptr) {
517 kfree(p->thread.io_bitmap_ptr);
518 p->thread.io_bitmap_max = 0;
519 }
520 return err;
521 }
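/*
 * Note on the CLONE_SETTLS case above: the TLS argument is taken from the
 * clone() register frame, i.e. %r8 (the fifth syscall argument) for native
 * 64-bit callers and %esi (a struct user_desc pointer) for 32-bit callers
 * under IA32 emulation, which is why the two branches read childregs->r8 and
 * childregs->si respectively.
 */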
522
523 void
524 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
525 {
526 asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
527 load_gs_index(0);
528 regs->ip = new_ip;
529 regs->sp = new_sp;
530 write_pda(oldrsp, new_sp);
531 regs->cs = __USER_CS;
532 regs->ss = __USER_DS;
533 regs->flags = 0x200;
534 set_fs(USER_DS);
535 }
536 EXPORT_SYMBOL_GPL(start_thread);
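/*
 * start_thread() hands a freshly exec'ed task over to user mode: the data
 * segment selectors and the gs base are cleared, and regs->flags = 0x200 is
 * simply X86_EFLAGS_IF, i.e. the task starts with interrupts enabled and all
 * other flags clear.
 */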
537
538 /*
539 * This special macro can be used to load a debugging register
540 */
541 #define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
542
543 static inline void __switch_to_xtra(struct task_struct *prev_p,
544 struct task_struct *next_p,
545 struct tss_struct *tss)
546 {
547 struct thread_struct *prev, *next;
548 unsigned long debugctl;
549
550 prev = &prev_p->thread;
551 next = &next_p->thread;
552
553 debugctl = prev->debugctlmsr;
554 if (next->ds_area_msr != prev->ds_area_msr) {
555 /* we clear debugctl to make sure DS
556 * is not in use when we change it */
557 debugctl = 0;
558 update_debugctlmsr(0);
559 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
560 }
561
562 if (next->debugctlmsr != debugctl)
563 update_debugctlmsr(next->debugctlmsr);
564
565 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
566 loaddebug(next, 0);
567 loaddebug(next, 1);
568 loaddebug(next, 2);
569 loaddebug(next, 3);
570 /* no 4 and 5 */
571 loaddebug(next, 6);
572 loaddebug(next, 7);
573 }
574
575 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
576 /*
577 * Copy the relevant range of the IO bitmap.
578 * Normally this is 128 bytes or less:
579 */
580 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
581 max(prev->io_bitmap_max, next->io_bitmap_max));
582 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
583 /*
584 * Clear any possible leftover bits:
585 */
586 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
587 }
588
589 #ifdef X86_BTS
590 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
591 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
592
593 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
594 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
595 #endif
596 }
597
598 /*
599 * switch_to(x,y) should switch tasks from x to y.
600 *
601 * This could still be optimized:
602 * - fold all the options into a flag word and test it with a single test.
603 * - could test fs/gs bitsliced
604 *
605 * Kprobes not supported here. Set the probe on schedule instead.
606 */
607 struct task_struct *
608 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
609 {
610 struct thread_struct *prev = &prev_p->thread,
611 *next = &next_p->thread;
612 int cpu = smp_processor_id();
613 struct tss_struct *tss = &per_cpu(init_tss, cpu);
614
615 /* we're going to use this soon, after a few expensive things */
616 if (next_p->fpu_counter>5)
617 prefetch(&next->i387.fxsave);
618
619 /*
620 * Reload esp0, LDT and the page table pointer:
621 */
622 load_sp0(tss, next);
623
624 /*
625 * Switch DS and ES.
626 * This won't pick up thread selector changes, but I guess that is ok.
627 */
628 asm volatile("mov %%es,%0" : "=m" (prev->es));
629 if (unlikely(next->es | prev->es))
630 loadsegment(es, next->es);
631
632 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
633 if (unlikely(next->ds | prev->ds))
634 loadsegment(ds, next->ds);
635
636 load_TLS(next, cpu);
637
638 /*
639 * Switch FS and GS.
640 */
641 {
642 unsigned fsindex;
643 asm volatile("movl %%fs,%0" : "=r" (fsindex));
644 /* A segment selector != 0 always requires a reload;
645 also reload when the selector has changed.
646 When the previous process used a 64-bit base, always reload
647 to avoid an information leak. */
648 if (unlikely(fsindex | next->fsindex | prev->fs)) {
649 loadsegment(fs, next->fsindex);
650 /* Check whether the user used a selector != 0;
651 * if so, clear the 64-bit base, since the MSR base
652 * only takes effect with the null selector
653 */
654 if (fsindex)
655 prev->fs = 0;
656 }
657 /* when next process has a 64bit base use it */
658 if (next->fs)
659 wrmsrl(MSR_FS_BASE, next->fs);
660 prev->fsindex = fsindex;
661 }
662 {
663 unsigned gsindex;
664 asm volatile("movl %%gs,%0" : "=r" (gsindex));
665 if (unlikely(gsindex | next->gsindex | prev->gs)) {
666 load_gs_index(next->gsindex);
667 if (gsindex)
668 prev->gs = 0;
669 }
670 if (next->gs)
671 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
672 prev->gsindex = gsindex;
673 }
674
675 /* Must be after DS reload */
676 unlazy_fpu(prev_p);
677
678 /*
679 * Switch the PDA and FPU contexts.
680 */
681 prev->usersp = read_pda(oldrsp);
682 write_pda(oldrsp, next->usersp);
683 write_pda(pcurrent, next_p);
684
685 write_pda(kernelstack,
686 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
687 #ifdef CONFIG_CC_STACKPROTECTOR
688 write_pda(stack_canary, next_p->stack_canary);
689 /*
690 * Build time only check to make sure the stack_canary is at
691 * offset 40 in the pda; this is a gcc ABI requirement
692 */
693 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
694 #endif
695
696 /*
697 * Now maybe reload the debug registers and handle I/O bitmaps
698 */
699 if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
700 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
701 __switch_to_xtra(prev_p, next_p, tss);
702
703 /* If the task has used the FPU during the last 5 timeslices, do a full
704 * restore of the math state immediately to avoid the trap; the
705 * chances of needing the FPU soon are obviously high now
706 */
707 if (next_p->fpu_counter>5)
708 math_state_restore();
709 return prev_p;
710 }
711
712 /*
713 * sys_execve() executes a new program.
714 */
715 asmlinkage
716 long sys_execve(char __user *name, char __user * __user *argv,
717 char __user * __user *envp, struct pt_regs *regs)
718 {
719 long error;
720 char * filename;
721
722 filename = getname(name);
723 error = PTR_ERR(filename);
724 if (IS_ERR(filename))
725 return error;
726 error = do_execve(filename, argv, envp, regs);
727 putname(filename);
728 return error;
729 }
730
731 void set_personality_64bit(void)
732 {
733 /* inherit personality from parent */
734
735 /* Make sure to be in 64bit mode */
736 clear_thread_flag(TIF_IA32);
737
738 /* TBD: this overwrites the user setup. Should have two bits.
739 But 64bit processes have always behaved this way,
740 so it's not too bad. The main problem is just that
741 32bit children are affected again. */
742 current->personality &= ~READ_IMPLIES_EXEC;
743 }
744
745 asmlinkage long sys_fork(struct pt_regs *regs)
746 {
747 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
748 }
749
750 asmlinkage long
751 sys_clone(unsigned long clone_flags, unsigned long newsp,
752 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
753 {
754 if (!newsp)
755 newsp = regs->sp;
756 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
757 }
758
759 /*
760 * This is trivial, and on the face of it looks like it
761 * could equally well be done in user mode.
762 *
763 * Not so, for quite unobvious reasons - register pressure.
764 * In user mode vfork() cannot have a stack frame, and if
765 * done by calling the "clone()" system call directly, you
766 * do not have enough call-clobbered registers to hold all
767 * the information you need.
768 */
769 asmlinkage long sys_vfork(struct pt_regs *regs)
770 {
771 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
772 NULL, NULL);
773 }
774
775 unsigned long get_wchan(struct task_struct *p)
776 {
777 unsigned long stack;
778 u64 fp,ip;
779 int count = 0;
780
781 if (!p || p == current || p->state==TASK_RUNNING)
782 return 0;
783 stack = (unsigned long)task_stack_page(p);
784 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
785 return 0;
786 fp = *(u64 *)(p->thread.sp);
787 do {
788 if (fp < (unsigned long)stack ||
789 fp > (unsigned long)stack+THREAD_SIZE)
790 return 0;
791 ip = *(u64 *)(fp+8);
792 if (!in_sched_functions(ip))
793 return ip;
794 fp = *(u64 *)fp;
795 } while (count++ < 16);
796 return 0;
797 }
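/*
 * get_wchan() relies on the usual frame-pointer layout: the word at
 * p->thread.sp is the frame pointer saved by the context switch, the word at
 * fp + 8 is a return address, and *fp is the caller's frame pointer. The loop
 * walks at most 16 frames until it finds a return address outside the
 * scheduler (!in_sched_functions()).
 */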
798
799 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
800 {
801 int ret = 0;
802 int doit = task == current;
803 int cpu;
804
805 switch (code) {
806 case ARCH_SET_GS:
807 if (addr >= TASK_SIZE_OF(task))
808 return -EPERM;
809 cpu = get_cpu();
810 /* handle small bases via the GDT because that's faster to
811 switch. */
812 if (addr <= 0xffffffff) {
813 set_32bit_tls(task, GS_TLS, addr);
814 if (doit) {
815 load_TLS(&task->thread, cpu);
816 load_gs_index(GS_TLS_SEL);
817 }
818 task->thread.gsindex = GS_TLS_SEL;
819 task->thread.gs = 0;
820 } else {
821 task->thread.gsindex = 0;
822 task->thread.gs = addr;
823 if (doit) {
824 load_gs_index(0);
825 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
826 }
827 }
828 put_cpu();
829 break;
830 case ARCH_SET_FS:
831 /* Not strictly needed for fs, but do it for symmetry
832 with gs */
833 if (addr >= TASK_SIZE_OF(task))
834 return -EPERM;
835 cpu = get_cpu();
836 /* handle small bases via the GDT because that's faster to
837 switch. */
838 if (addr <= 0xffffffff) {
839 set_32bit_tls(task, FS_TLS, addr);
840 if (doit) {
841 load_TLS(&task->thread, cpu);
842 asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
843 }
844 task->thread.fsindex = FS_TLS_SEL;
845 task->thread.fs = 0;
846 } else {
847 task->thread.fsindex = 0;
848 task->thread.fs = addr;
849 if (doit) {
850 /* set the selector to 0 to not confuse
851 __switch_to */
852 asm volatile("movl %0,%%fs" :: "r" (0));
853 ret = checking_wrmsrl(MSR_FS_BASE, addr);
854 }
855 }
856 put_cpu();
857 break;
858 case ARCH_GET_FS: {
859 unsigned long base;
860 if (task->thread.fsindex == FS_TLS_SEL)
861 base = read_32bit_tls(task, FS_TLS);
862 else if (doit)
863 rdmsrl(MSR_FS_BASE, base);
864 else
865 base = task->thread.fs;
866 ret = put_user(base, (unsigned long __user *)addr);
867 break;
868 }
869 case ARCH_GET_GS: {
870 unsigned long base;
871 unsigned gsindex;
872 if (task->thread.gsindex == GS_TLS_SEL)
873 base = read_32bit_tls(task, GS_TLS);
874 else if (doit) {
875 asm("movl %%gs,%0" : "=r" (gsindex));
876 if (gsindex)
877 rdmsrl(MSR_KERNEL_GS_BASE, base);
878 else
879 base = task->thread.gs;
880 }
881 else
882 base = task->thread.gs;
883 ret = put_user(base, (unsigned long __user *)addr);
884 break;
885 }
886
887 default:
888 ret = -EINVAL;
889 break;
890 }
891
892 return ret;
893 }
894
895 long sys_arch_prctl(int code, unsigned long addr)
896 {
897 return do_arch_prctl(current, code, addr);
898 }
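/*
 * do_arch_prctl() backs the arch_prctl(2) system call. For example, a 64-bit
 * C library typically installs its thread-local storage block with something
 * like the following (tcb being a hypothetical pointer to the thread control
 * block):
 *
 *	arch_prctl(ARCH_SET_FS, (unsigned long)tcb);
 *
 * Bases that fit in 32 bits are served via a GDT entry (set_32bit_tls()
 * above); larger bases go through MSR_FS_BASE/MSR_KERNEL_GS_BASE.
 */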
899
900 unsigned long arch_align_stack(unsigned long sp)
901 {
902 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
903 sp -= get_random_int() % 8192;
904 return sp & ~0xf;
905 }
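/*
 * arch_align_stack() jitters the initial user stack pointer downwards by up
 * to 8 KiB (get_random_int() % 8192) and then rounds down to a 16-byte
 * boundary, matching the stack alignment the x86-64 ABI expects.
 */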
906
907 unsigned long arch_randomize_brk(struct mm_struct *mm)
908 {
909 unsigned long range_end = mm->brk + 0x02000000;
910 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
911 }
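/*
 * arch_randomize_brk() places the heap start somewhere in the 32 MiB
 * (0x02000000 byte) window above the ELF-chosen brk, falling back to the
 * unrandomized mm->brk if randomize_range() returns 0.
 */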