/*
 * A hypervisor allows multiple Operating Systems to run on a single machine.
 * To quote David Wheeler: "Any problem in computer science can be solved with
 * another layer of indirection."
 *
 * We keep things simple in two ways.  First, we start with a normal Linux
 * kernel and insert a module (lg.ko) which allows us to run other Linux
 * kernels the same way we'd run processes.  We call the first kernel the
 * Host, and the others the Guests.  The program which sets up and configures
 * Guests (such as the example in Documentation/lguest/lguest.c) is called the
 * Launcher.
 *
 * Secondly, we only run specially modified Guests, not normal kernels.  When
 * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets
 * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows
 * how to be a Guest.  This means that you can use the same kernel you boot
 * normally (ie. as a Host) as a Guest.
 *
 * These Guests know that they cannot do privileged operations, such as
 * disabling interrupts, and that they have to ask the Host to do such things
 * explicitly.  This file consists of all the replacements for such low-level
 * native hardware operations: these special Guest versions call the Host.
 *
 * So how does the kernel know it's a Guest?  The Guest starts at a special
 * entry point marked with a magic string, which sets up a few things then
 * calls here.  We replace the native functions in "struct paravirt_ops"
 * with our Guest versions, then boot like normal. :*/
/*
 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#include <linux/kernel.h>
#include <linux/start_kernel.h>
#include <linux/string.h>
#include <linux/console.h>
#include <linux/screen_info.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/lguest.h>
#include <linux/lguest_launcher.h>
#include <linux/lguest_bus.h>
#include <asm/paravirt.h>
#include <asm/param.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/e820.h>
#include <asm/mce.h>
#include <asm/io.h>
/* Declarations for definitions in lguest_guest.S */
extern char lguest_noirq_start[], lguest_noirq_end[];
extern const char lgstart_cli[], lgend_cli[];
extern const char lgstart_sti[], lgend_sti[];
extern const char lgstart_popf[], lgend_popf[];
extern const char lgstart_pushf[], lgend_pushf[];
extern const char lgstart_iret[], lgend_iret[];
extern void lguest_iret(void);
struct lguest_data lguest_data = {
        .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
        .noirq_start = (u32)lguest_noirq_start,
        .noirq_end = (u32)lguest_noirq_end,
        .blocked_interrupts = { 1 }, /* Block timer interrupts */
};
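/* lguest_data is the structure the Guest shares with the Host: both sides
 * read and write it directly, so cheap state (the interrupt-enable flag, the
 * async hypercall ring, the blocked-interrupts bitmap) moves between them
 * without the cost of a hypercall.  The Host learns its address from the
 * LHCALL_LGUEST_INIT hypercall at the bottom of this file. */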
struct lguest_device_desc *lguest_devices;
static cycle_t clock_base;
static enum paravirt_lazy_mode lazy_mode;
static void lguest_lazy_mode(enum paravirt_lazy_mode mode)
{
        if (mode == PARAVIRT_LAZY_FLUSH) {
                if (unlikely(lazy_mode != PARAVIRT_LAZY_NONE))
                        hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
        } else {
                lazy_mode = mode;
                if (mode == PARAVIRT_LAZY_NONE)
                        hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
        }
}
static void lazy_hcall(unsigned long call,
                       unsigned long arg1,
                       unsigned long arg2,
                       unsigned long arg3)
{
        if (lazy_mode == PARAVIRT_LAZY_NONE)
                hcall(call, arg1, arg2, arg3);
        else
                async_hcall(call, arg1, arg2, arg3);
}
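/* A sketch of how lazy mode plays out (illustrative, not code from this
 * file): when the core kernel brackets a run of updates in a lazy region,
 * each lazy_hcall() above is queued via async_hcall() instead of trapping to
 * the Host, and leaving the region issues one LHCALL_FLUSH_ASYNC so the Host
 * replays the whole batch at once:
 *
 *      arch_enter_lazy_mmu_mode();             // -> PARAVIRT_LAZY_MMU
 *      set_pte_at(mm, addr1, ptep1, pte1);     // queued, no trap
 *      set_pte_at(mm, addr2, ptep2, pte2);     // queued, no trap
 *      arch_leave_lazy_mmu_mode();             // -> PARAVIRT_LAZY_NONE,
 *                                              //    hcall(LHCALL_FLUSH_ASYNC)
 */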
void async_hcall(unsigned long call,
                 unsigned long arg1, unsigned long arg2, unsigned long arg3)
{
        /* Note: This code assumes we're uniprocessor. */
        static unsigned int next_call;
        unsigned long flags;

        local_irq_save(flags);
        if (lguest_data.hcall_status[next_call] != 0xFF) {
                /* Table full, so do normal hcall which will flush table. */
                hcall(call, arg1, arg2, arg3);
        } else {
                lguest_data.hcalls[next_call].eax = call;
                lguest_data.hcalls[next_call].edx = arg1;
                lguest_data.hcalls[next_call].ebx = arg2;
                lguest_data.hcalls[next_call].ecx = arg3;
                /* Make sure host sees arguments before "valid" flag. */
                wmb();
                lguest_data.hcall_status[next_call] = 0;
                if (++next_call == LHCALL_RING_SIZE)
                        next_call = 0;
        }
        local_irq_restore(flags);
}
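/* The ring protocol in async_hcall() is worth spelling out.  Each of the
 * LHCALL_RING_SIZE slots has a status byte: 0xFF means "free for the Guest
 * to fill", 0 means "valid, waiting for the Host to consume".  The wmb() is
 * the crucial step: without it the Host could observe status == 0 and read
 * half-written argument slots.  Once the Host has executed a queued call it
 * sets the status byte back to 0xFF, which is why a full ring is handled by
 * a synchronous hcall(): that traps to the Host, which drains the ring
 * before the real call, freeing the slots again. */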
void lguest_send_dma(unsigned long key, struct lguest_dma *dma)
{
        dma->used_len = 0;
        hcall(LHCALL_SEND_DMA, key, __pa(dma), 0);
}
int lguest_bind_dma(unsigned long key, struct lguest_dma *dmas,
                    unsigned int num, u8 irq)
{
        if (!hcall(LHCALL_BIND_DMA, key, __pa(dmas), (num << 8) | irq))
                return -ENOMEM;
        return 0;
}
void lguest_unbind_dma(unsigned long key, struct lguest_dma *dmas)
{
        hcall(LHCALL_BIND_DMA, key, __pa(dmas), 0);
}
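/* To see how these three fit together, here's a hedged sketch of what a
 * Guest driver's send path might look like (the variable names are made up
 * for illustration; the real users live in the lguest network and block
 * drivers):
 *
 *      struct lguest_dma dma;
 *      dma.addr[0] = __pa(buf);
 *      dma.len[0] = len;
 *      dma.len[1] = 0;                 // zero length terminates the list
 *      lguest_send_dma(peer_key, &dma);
 *
 * The "key" is simply an agreed-upon physical address identifying the
 * rendezvous point: one side binds receive buffers to the key with
 * lguest_bind_dma() and gets "irq" when something arrives, the other side
 * sends to it. */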
/* For guests, device memory can be used as normal memory, so we cast away the
 * __iomem to quieten sparse. */
void *lguest_map(unsigned long phys_addr, unsigned long pages)
{
        return (__force void *)ioremap(phys_addr, PAGE_SIZE*pages);
}

void lguest_unmap(void *addr)
{
        iounmap((__force void __iomem *)addr);
}
static unsigned long save_fl(void)
{
        return lguest_data.irq_enabled;
}

static void restore_fl(unsigned long flags)
{
        /* FIXME: Check if interrupt pending... */
        lguest_data.irq_enabled = flags;
}

static void irq_disable(void)
{
        lguest_data.irq_enabled = 0;
}

static void irq_enable(void)
{
        /* FIXME: Check if interrupt pending... */
        lguest_data.irq_enabled = X86_EFLAGS_IF;
}
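/* Note what these four functions have in common: they only touch
 * lguest_data.irq_enabled, a word in the shared structure.  Disabling
 * interrupts in a Guest is therefore just a memory write, with no hypercall
 * at all; the Host checks the flag before delivering an interrupt.  The
 * FIXMEs above record the cost of that trick: if an interrupt arrived while
 * the flag was clear, nothing here forces prompt delivery when the flag is
 * set again.  These four are also the operations patched inline by
 * lguest_patch() below, so the common case doesn't even make a call. */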
static void lguest_write_idt_entry(struct desc_struct *dt,
                                   int entrynum, u32 low, u32 high)
{
        write_dt_entry(dt, entrynum, low, high);
        hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);
}
static void lguest_load_idt(const struct Xgt_desc_struct *desc)
{
        unsigned int i;
        struct desc_struct *idt = (void *)desc->address;

        for (i = 0; i < (desc->size+1)/8; i++)
                hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);
}
static void lguest_load_gdt(const struct Xgt_desc_struct *desc)
{
        BUG_ON((desc->size+1)/8 != GDT_ENTRIES);
        hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);
}
static void lguest_write_gdt_entry(struct desc_struct *dt,
                                   int entrynum, u32 low, u32 high)
{
        write_dt_entry(dt, entrynum, low, high);
        hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);
}
static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
{
        lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
}
static void lguest_set_ldt(const void *addr, unsigned entries)
{
}

static void lguest_load_tr_desc(void)
{
}
static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
                         unsigned int *ecx, unsigned int *edx)
{
        int function = *eax;

        native_cpuid(eax, ebx, ecx, edx);
        switch (function) {
        case 1: /* Basic feature request. */
                /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
                *ecx &= 0x00002201;
                /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
                *edx &= 0x07808101;
                /* Host wants to know when we flush kernel pages: set PGE. */
                *edx |= 0x00002000;
                break;
        case 0x80000000:
                /* Futureproof this a little: if they ask how much extended
                 * processor information, limit it to known fields. */
                if (*eax > 0x80000008)
                        *eax = 0x80000008;
                break;
        }
}
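/* To decode those masks (matching the comments above, bit numbers from the
 * CPUID leaf 1 layout): the ECX mask 0x00002201 keeps bit 0 (SSE3), bit 9
 * (SSSE3) and bit 13 (CMPXCHG16B); the EDX mask 0x07808101 keeps FPU (0),
 * CMPXCHG8B (8), CMOV (15), MMX (23), FXSR (24), SSE (25) and SSE2 (26);
 * and the OR-ed in 0x00002000 is EDX bit 13, PGE.  Every other feature bit
 * is hidden, so the Guest never tries to use hardware the Host won't let it
 * touch. */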
static unsigned long current_cr0, current_cr3;
static void lguest_write_cr0(unsigned long val)
{
        lazy_hcall(LHCALL_TS, val & 8, 0, 0);
        current_cr0 = val;
}

static unsigned long lguest_read_cr0(void)
{
        return current_cr0;
}

static void lguest_clts(void)
{
        lazy_hcall(LHCALL_TS, 0, 0, 0);
        current_cr0 &= ~8U;
}
static unsigned long lguest_read_cr2(void)
{
        return lguest_data.cr2;
}

static void lguest_write_cr3(unsigned long cr3)
{
        lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);
        current_cr3 = cr3;
}

static unsigned long lguest_read_cr3(void)
{
        return current_cr3;
}

/* Used to enable/disable PGE, but we don't care. */
static unsigned long lguest_read_cr4(void)
{
        return 0;
}

static void lguest_write_cr4(unsigned long val)
{
}
static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep, pte_t pteval)
{
        *ptep = pteval;
        lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);
}

/* We only support two-level pagetables at the moment. */
static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
        *pmdp = pmdval;
        lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK,
                   (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);
}
/* FIXME: Eliminate all callers of this. */
static void lguest_set_pte(pte_t *ptep, pte_t pteval)
{
        *ptep = pteval;
        /* Don't bother with hypercall before initial setup. */
        if (current_cr3)
                lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
}
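/* Why tell the Host about every PTE write?  The Guest's page tables are
 * only ever read by the Guest: the real ("shadow") page tables the CPU uses
 * live in the Host, which lazily copies entries over as the Guest faults on
 * them.  So a PTE write here is really an invalidation: the SET_PTE/SET_PMD
 * hypercalls let the Host drop or update its shadow copy, and
 * lguest_set_pte() above, which doesn't know which virtual address the PTE
 * maps, has to take the big hammer and flush the whole shadow tree. */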
static void lguest_flush_tlb_single(unsigned long addr)
{
        /* Simply set it to zero, and it will fault back in. */
        lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0);
}

static void lguest_flush_tlb_user(void)
{
        lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);
}

static void lguest_flush_tlb_kernel(void)
{
        lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);
}
static void disable_lguest_irq(unsigned int irq)
{
        set_bit(irq, lguest_data.blocked_interrupts);
}

static void enable_lguest_irq(unsigned int irq)
{
        clear_bit(irq, lguest_data.blocked_interrupts);
        /* FIXME: If it's pending? */
}
static struct irq_chip lguest_irq_controller = {
        .name           = "lguest",
        .mask           = disable_lguest_irq,
        .mask_ack       = disable_lguest_irq,
        .unmask         = enable_lguest_irq,
};
static void __init lguest_init_IRQ(void)
{
        unsigned int i;

        for (i = 0; i < LGUEST_IRQS; i++) {
                int vector = FIRST_EXTERNAL_VECTOR + i;
                if (vector != SYSCALL_VECTOR) {
                        set_intr_gate(vector, interrupt[i]);
                        set_irq_chip_and_handler(i, &lguest_irq_controller,
                                                 handle_level_irq);
                }
        }
        irq_ctx_init(smp_processor_id());
}
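/* A concrete example of the loop above: with FIRST_EXTERNAL_VECTOR at 0x20,
 * Guest interrupt 0 (the timer) uses trap vector 0x20, interrupt 1 uses
 * 0x21, and so on; vector 0x80 (SYSCALL_VECTOR) is skipped so the Guest's
 * own "int $0x80" system calls keep working.  The interrupt[] stubs are the
 * generic x86 entry points which push the vector number and call do_IRQ(). */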
static unsigned long lguest_get_wallclock(void)
{
        return hcall(LHCALL_GET_WALLCLOCK, 0, 0, 0);
}
static cycle_t lguest_clock_read(void)
{
        if (lguest_data.tsc_khz)
                return native_read_tsc();
        else
                return jiffies;
}
/* This is what we tell the kernel is our clocksource. */
static struct clocksource lguest_clock = {
        .name           = "lguest",
        .rating         = 400,
        .read           = lguest_clock_read,
};
static unsigned long long lguest_sched_clock(void)
{
        return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base);
}
/* We also need a "struct clock_event_device": Linux asks us to set it to go
 * off some time in the future. Actually, James Morris figured all this out, I
 * just applied the patch. */
static int lguest_clockevent_set_next_event(unsigned long delta,
                                            struct clock_event_device *evt)
{
        if (delta < LG_CLOCK_MIN_DELTA) {
                if (printk_ratelimit())
                        printk(KERN_DEBUG "%s: small delta %lu ns\n",
                               __FUNCTION__, delta);
                return -ETIME;
        }
        hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
        return 0;
}
static void lguest_clockevent_set_mode(enum clock_event_mode mode,
                                       struct clock_event_device *evt)
{
        switch (mode) {
        case CLOCK_EVT_MODE_UNUSED:
        case CLOCK_EVT_MODE_SHUTDOWN:
                /* A 0 argument shuts the clock down. */
                hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0);
                break;
        case CLOCK_EVT_MODE_ONESHOT:
                /* This is what we expect. */
                break;
        case CLOCK_EVT_MODE_PERIODIC:
                BUG();
        case CLOCK_EVT_MODE_RESUME:
                break;
        }
}
/* This describes our primitive timer chip. */
static struct clock_event_device lguest_clockevent = {
        .name                   = "lguest",
        .features               = CLOCK_EVT_FEAT_ONESHOT,
        .set_next_event         = lguest_clockevent_set_next_event,
        .set_mode               = lguest_clockevent_set_mode,
        .rating                 = INT_MAX,
        .mult                   = 1,
        .shift                  = 0,
        .min_delta_ns           = LG_CLOCK_MIN_DELTA,
        .max_delta_ns           = LG_CLOCK_MAX_DELTA,
};
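/* Because .mult is 1 and .shift is 0, the generic clockevent conversion
 * (delta_ns * mult) >> shift is the identity: this "timer chip" is
 * programmed directly in nanoseconds.  That's why min/max_delta_ns can
 * simply be the LG_CLOCK_MIN/MAX_DELTA constants, and why
 * lguest_clockevent_set_next_event() can pass the delta straight into the
 * LHCALL_SET_CLOCKEVENT hypercall. */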
/* This is the Guest timer interrupt handler (hardware interrupt 0). We just
 * call the clockevent infrastructure and it does whatever needs doing. */
static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
{
        unsigned long flags;

        /* Don't interrupt us while this is running. */
        local_irq_save(flags);
        lguest_clockevent.event_handler(&lguest_clockevent);
        local_irq_restore(flags);
}
static void lguest_time_init(void)
{
        set_irq_handler(0, lguest_time_irq);

        /* We use the TSC if the Host tells us we can, otherwise a dumb
         * jiffies-based clock. */
        if (lguest_data.tsc_khz) {
                lguest_clock.shift = 22;
                lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
                                                         lguest_clock.shift);
                lguest_clock.mask = CLOCKSOURCE_MASK(64);
                lguest_clock.flags = CLOCK_SOURCE_IS_CONTINUOUS;
        } else {
                /* To understand this, start at kernel/time/jiffies.c... */
                lguest_clock.shift = 8;
                lguest_clock.mult = (((u64)NSEC_PER_SEC<<8)/ACTHZ) << 8;
                lguest_clock.mask = CLOCKSOURCE_MASK(32);
        }
        clock_base = lguest_clock_read();
        clocksource_register(&lguest_clock);

        /* We can't set cpumask in the initializer: damn C limitations! */
        lguest_clockevent.cpumask = cpumask_of_cpu(0);
        clockevents_register_device(&lguest_clockevent);

        enable_lguest_irq(0);
}
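/* The mult/shift pairs above encode "nanoseconds per cycle" as a fixed-point
 * fraction: the generic code computes ns = (cycles * mult) >> shift.  As a
 * rough worked example (assuming a 3GHz TSC, so tsc_khz = 3000000):
 * clocksource_khz2mult(3000000, 22) picks mult = (10^6 << 22) / 3000000,
 * about 0x155555, so (cycles * mult) >> 22 ~= cycles / 3 -- one nanosecond
 * per three cycles.  The jiffies fallback plays the same trick at much
 * coarser resolution, which is why it only needs a 32-bit mask. */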
static void lguest_load_esp0(struct tss_struct *tss,
                             struct thread_struct *thread)
{
        lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0,
                   THREAD_SIZE/PAGE_SIZE);
}
static void lguest_set_debugreg(int regno, unsigned long value)
{
        /* FIXME: Implement */
}

static void lguest_wbinvd(void)
{
}
#ifdef CONFIG_X86_LOCAL_APIC
static void lguest_apic_write(unsigned long reg, unsigned long v)
{
}

static unsigned long lguest_apic_read(unsigned long reg)
{
        return 0;
}
#endif
static void lguest_safe_halt(void)
{
        hcall(LHCALL_HALT, 0, 0, 0);
}

static void lguest_power_off(void)
{
        hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
}
static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
{
        hcall(LHCALL_CRASH, __pa(p), 0, 0);
        return NOTIFY_DONE;
}

static struct notifier_block paniced = {
        .notifier_call = lguest_panic
};
static __init char *lguest_memory_setup(void)
{
        /* We do this here because lockcheck barfs if before start_kernel */
        atomic_notifier_chain_register(&panic_notifier_list, &paniced);

        add_memory_region(E820_MAP->addr, E820_MAP->size, E820_MAP->type);
        return "LGUEST";
}
static const struct lguest_insns
{
        const char *start, *end;
} lguest_insns[] = {
        [PARAVIRT_PATCH(irq_disable)] = { lgstart_cli, lgend_cli },
        [PARAVIRT_PATCH(irq_enable)] = { lgstart_sti, lgend_sti },
        [PARAVIRT_PATCH(restore_fl)] = { lgstart_popf, lgend_popf },
        [PARAVIRT_PATCH(save_fl)] = { lgstart_pushf, lgend_pushf },
};
static unsigned lguest_patch(u8 type, u16 clobber, void *insns, unsigned len)
{
        unsigned int insn_len;

        /* Don't touch it if we don't have a replacement */
        if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start)
                return paravirt_patch_default(type, clobber, insns, len);

        insn_len = lguest_insns[type].end - lguest_insns[type].start;

        /* Similarly if we can't fit replacement. */
        if (insn_len > len)
                return paravirt_patch_default(type, clobber, insns, len);

        memcpy(insns, lguest_insns[type].start, insn_len);
        return insn_len;
}
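/* To make the patching concrete: paravirt_ops.irq_disable is a function
 * pointer, so every local_irq_disable() would normally be an indirect call
 * into irq_disable() above.  lguest_patch() instead copies the few bytes
 * between lgstart_cli and lgend_cli (from lguest_guest.S, essentially
 * "movl $0, lguest_data+irq_enabled") straight over the call site, so the
 * four hot operations -- cli, sti, popf, pushf -- run inline with no call
 * at all.  Anything not listed in lguest_insns[] falls back to
 * paravirt_patch_default(), which leaves the indirect call in place. */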
__init void lguest_init(void *boot)
{
        /* Copy boot parameters first. */
        memcpy(&boot_params, boot, PARAM_SIZE);
        memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr),
               COMMAND_LINE_SIZE);
        paravirt_ops.name = "lguest";
        paravirt_ops.paravirt_enabled = 1;
        paravirt_ops.kernel_rpl = 1;

        paravirt_ops.save_fl = save_fl;
        paravirt_ops.restore_fl = restore_fl;
        paravirt_ops.irq_disable = irq_disable;
        paravirt_ops.irq_enable = irq_enable;
        paravirt_ops.load_gdt = lguest_load_gdt;
        paravirt_ops.memory_setup = lguest_memory_setup;
        paravirt_ops.cpuid = lguest_cpuid;
        paravirt_ops.write_cr3 = lguest_write_cr3;
        paravirt_ops.flush_tlb_user = lguest_flush_tlb_user;
        paravirt_ops.flush_tlb_single = lguest_flush_tlb_single;
        paravirt_ops.flush_tlb_kernel = lguest_flush_tlb_kernel;
        paravirt_ops.set_pte = lguest_set_pte;
        paravirt_ops.set_pte_at = lguest_set_pte_at;
        paravirt_ops.set_pmd = lguest_set_pmd;
#ifdef CONFIG_X86_LOCAL_APIC
        paravirt_ops.apic_write = lguest_apic_write;
        paravirt_ops.apic_write_atomic = lguest_apic_write;
        paravirt_ops.apic_read = lguest_apic_read;
#endif
        paravirt_ops.load_idt = lguest_load_idt;
        paravirt_ops.iret = lguest_iret;
        paravirt_ops.load_esp0 = lguest_load_esp0;
        paravirt_ops.load_tr_desc = lguest_load_tr_desc;
        paravirt_ops.set_ldt = lguest_set_ldt;
        paravirt_ops.load_tls = lguest_load_tls;
        paravirt_ops.set_debugreg = lguest_set_debugreg;
        paravirt_ops.clts = lguest_clts;
        paravirt_ops.read_cr0 = lguest_read_cr0;
        paravirt_ops.write_cr0 = lguest_write_cr0;
        paravirt_ops.init_IRQ = lguest_init_IRQ;
        paravirt_ops.read_cr2 = lguest_read_cr2;
        paravirt_ops.read_cr3 = lguest_read_cr3;
        paravirt_ops.read_cr4 = lguest_read_cr4;
        paravirt_ops.write_cr4 = lguest_write_cr4;
        paravirt_ops.write_gdt_entry = lguest_write_gdt_entry;
        paravirt_ops.write_idt_entry = lguest_write_idt_entry;
        paravirt_ops.patch = lguest_patch;
        paravirt_ops.safe_halt = lguest_safe_halt;
        paravirt_ops.get_wallclock = lguest_get_wallclock;
        paravirt_ops.time_init = lguest_time_init;
        paravirt_ops.set_lazy_mode = lguest_lazy_mode;
        paravirt_ops.wbinvd = lguest_wbinvd;
        paravirt_ops.sched_clock = lguest_sched_clock;
        hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);

        /* We use top of mem for initial pagetables. */
        init_pg_tables_end = __pa(pg0);

        asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");

        reserve_top_address(lguest_data.reserve_mem);

        lockdep_init();

        paravirt_disable_iospace();

        cpu_detect(&new_cpu_data);
        /* head.S usually sets up the first capability word, so do it here. */
        new_cpu_data.x86_capability[0] = cpuid_edx(1);

        /* Math is always hard! */
        new_cpu_data.hard_math = 1;

#ifdef CONFIG_X86_MCE
        mce_disabled = 1;
#endif
#ifdef CONFIG_ACPI
        acpi_disabled = 1;
        acpi_ht = 0;
#endif

        add_preferred_console("hvc", 0, NULL);

        pm_power_off = lguest_power_off;
        start_kernel();
}