/*
 * Lguest specific paravirt-ops implementation
 *
 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
21 #include <linux/kernel.h>
22 #include <linux/start_kernel.h>
23 #include <linux/string.h>
24 #include <linux/console.h>
25 #include <linux/screen_info.h>
26 #include <linux/irq.h>
27 #include <linux/interrupt.h>
28 #include <linux/lguest.h>
29 #include <linux/lguest_launcher.h>
30 #include <linux/lguest_bus.h>
31 #include <asm/paravirt.h>
32 #include <asm/param.h>
34 #include <asm/pgtable.h>
36 #include <asm/setup.h>
/*
 * Declarations for definitions in lguest_guest.S.
 *
 * Each start/end pair delimits a region of assembler; the cli/sti/popf/pushf
 * pairs are the replacement instruction sequences listed in lguest_insns[].
 */
extern char lguest_noirq_start[], lguest_noirq_end[];
extern const char lgstart_cli[], lgend_cli[];
extern const char lgstart_sti[], lgend_sti[];
extern const char lgstart_popf[], lgend_popf[];
extern const char lgstart_pushf[], lgend_pushf[];
extern const char lgstart_iret[], lgend_iret[];
extern void lguest_iret(void);
50 struct lguest_data lguest_data
= {
51 .hcall_status
= { [0 ... LHCALL_RING_SIZE
-1] = 0xFF },
52 .noirq_start
= (u32
)lguest_noirq_start
,
53 .noirq_end
= (u32
)lguest_noirq_end
,
54 .blocked_interrupts
= { 1 }, /* Block timer interrupts */
56 struct lguest_device_desc
*lguest_devices
;
57 static __initdata
const struct lguest_boot_info
*boot
= __va(0);
/*
 * Lazy-mode switch hook (installed as paravirt_ops.set_lazy_mode).
 *
 * Visible behaviour: LHCALL_FLUSH_ASYNC is issued when PARAVIRT_LAZY_FLUSH is
 * requested while lazy_mode is not PARAVIRT_LAZY_NONE; a second test issues
 * the same hypercall when the requested mode is PARAVIRT_LAZY_NONE.
 *
 * NOTE(review): this copy is damaged - every line has a stray numeric
 * prefix, identifiers are split across lines and the braces are gone.  The
 * prefixes jump from 64 to 67, so two body lines are absent; presumably they
 * close the first branch and store the new mode in lazy_mode.  Confirm
 * against a pristine copy before relying on this block.
 */
59 static enum paravirt_lazy_mode lazy_mode
;
60 static void lguest_lazy_mode(enum paravirt_lazy_mode mode
)
62 if (mode
== PARAVIRT_LAZY_FLUSH
) {
63 if (unlikely(lazy_mode
!= PARAVIRT_LAZY_NONE
))
64 hcall(LHCALL_FLUSH_ASYNC
, 0, 0, 0);
67 if (mode
== PARAVIRT_LAZY_NONE
)
68 hcall(LHCALL_FLUSH_ASYNC
, 0, 0, 0);
/*
 * Dispatch a hypercall according to lazy_mode.
 *
 * Visible behaviour: when lazy_mode is PARAVIRT_LAZY_NONE the call is made
 * with hcall(); an async_hcall() with the same four values also appears.
 *
 * NOTE(review): damaged copy.  The prefixes jump from 72 to 77, so the
 * declarations of arg1..arg3 (used below) are absent, and prefix 79 is
 * absent between the two calls - presumably an "else"; confirm against a
 * pristine copy.
 */
72 static void lazy_hcall(unsigned long call
,
77 if (lazy_mode
== PARAVIRT_LAZY_NONE
)
78 hcall(call
, arg1
, arg2
, arg3
);
80 async_hcall(call
, arg1
, arg2
, arg3
);
/*
 * Queue a hypercall in the lguest_data.hcalls ring.
 *
 * Visible behaviour, between local_irq_save() and local_irq_restore():
 *  - if hcall_status[next_call] is not 0xFF the ring is treated as full and
 *    a plain hcall() is made with the same arguments;
 *  - the call number and the three arguments are written to the eax, edx,
 *    ebx and ecx fields of slot next_call, the slot's status is set to 0,
 *    and next_call is incremented and compared with LHCALL_RING_SIZE.
 *
 * NOTE(review): damaged copy.  Prefixes 88-89, 94, 100 and 103-104 are
 * absent.  Presumably these are the declaration of flags, the "else" that
 * separates the full-ring path from the queueing path, the barrier that the
 * comment at 99 asks for, and the wrap-around reset of next_call.  Confirm
 * against a pristine copy.
 */
83 void async_hcall(unsigned long call
,
84 unsigned long arg1
, unsigned long arg2
, unsigned long arg3
)
86 /* Note: This code assumes we're uniprocessor. */
87 static unsigned int next_call
;
90 local_irq_save(flags
);
91 if (lguest_data
.hcall_status
[next_call
] != 0xFF) {
92 /* Table full, so do normal hcall which will flush table. */
93 hcall(call
, arg1
, arg2
, arg3
);
95 lguest_data
.hcalls
[next_call
].eax
= call
;
96 lguest_data
.hcalls
[next_call
].edx
= arg1
;
97 lguest_data
.hcalls
[next_call
].ebx
= arg2
;
98 lguest_data
.hcalls
[next_call
].ecx
= arg3
;
99 /* Make sure host sees arguments before "valid" flag. */
101 lguest_data
.hcall_status
[next_call
] = 0;
102 if (++next_call
== LHCALL_RING_SIZE
)
105 local_irq_restore(flags
);
/*
 * Issue LHCALL_SEND_DMA with @key and the physical address of @dma.
 *
 * NOTE(review): damaged copy.  Prefix 110 is absent, so one body line ahead
 * of the hypercall is missing; its content cannot be determined from here.
 */
108 void lguest_send_dma(unsigned long key
, struct lguest_dma
*dma
)
111 hcall(LHCALL_SEND_DMA
, key
, __pa(dma
), 0);
/*
 * Issue LHCALL_BIND_DMA with @key, the physical address of @dmas, and
 * (@num << 8) | @irq packed into the last argument.  The hypercall result is
 * tested with '!'.
 *
 * NOTE(review): damaged copy.  Prefixes 118-119 are absent, so the return
 * statements are missing and the mapping from the hypercall result to this
 * function's return value cannot be stated from this copy.
 */
114 int lguest_bind_dma(unsigned long key
, struct lguest_dma
*dmas
,
115 unsigned int num
, u8 irq
)
117 if (!hcall(LHCALL_BIND_DMA
, key
, __pa(dmas
), (num
<< 8) | irq
))
122 void lguest_unbind_dma(unsigned long key
, struct lguest_dma
*dmas
)
124 hcall(LHCALL_BIND_DMA
, key
, __pa(dmas
), 0);
127 /* For guests, device memory can be used as normal memory, so we cast away the
128 * __iomem to quieten sparse. */
129 void *lguest_map(unsigned long phys_addr
, unsigned long pages
)
131 return (__force
void *)ioremap(phys_addr
, PAGE_SIZE
*pages
);
134 void lguest_unmap(void *addr
)
136 iounmap((__force
void __iomem
*)addr
);
139 static unsigned long save_fl(void)
141 return lguest_data
.irq_enabled
;
144 static void restore_fl(unsigned long flags
)
146 /* FIXME: Check if interrupt pending... */
147 lguest_data
.irq_enabled
= flags
;
150 static void irq_disable(void)
152 lguest_data
.irq_enabled
= 0;
155 static void irq_enable(void)
157 /* FIXME: Check if interrupt pending... */
158 lguest_data
.irq_enabled
= X86_EFLAGS_IF
;
161 static void lguest_write_idt_entry(struct desc_struct
*dt
,
162 int entrynum
, u32 low
, u32 high
)
164 write_dt_entry(dt
, entrynum
, low
, high
);
165 hcall(LHCALL_LOAD_IDT_ENTRY
, entrynum
, low
, high
);
/*
 * Walk the table described by @desc: for each of the (size + 1) / 8 entries
 * at desc->address, issue LHCALL_LOAD_IDT_ENTRY with the entry index and the
 * entry's a and b words.
 *
 * NOTE(review): damaged copy.  Prefix 170 is absent; presumably it is the
 * declaration of the loop counter i - confirm against a pristine copy.
 */
168 static void lguest_load_idt(const struct Xgt_desc_struct
*desc
)
171 struct desc_struct
*idt
= (void *)desc
->address
;
173 for (i
= 0; i
< (desc
->size
+1)/8; i
++)
174 hcall(LHCALL_LOAD_IDT_ENTRY
, i
, idt
[i
].a
, idt
[i
].b
);
177 static void lguest_load_gdt(const struct Xgt_desc_struct
*desc
)
179 BUG_ON((desc
->size
+1)/8 != GDT_ENTRIES
);
180 hcall(LHCALL_LOAD_GDT
, __pa(desc
->address
), GDT_ENTRIES
, 0);
183 static void lguest_write_gdt_entry(struct desc_struct
*dt
,
184 int entrynum
, u32 low
, u32 high
)
186 write_dt_entry(dt
, entrynum
, low
, high
);
187 hcall(LHCALL_LOAD_GDT
, __pa(dt
), GDT_ENTRIES
, 0);
190 static void lguest_load_tls(struct thread_struct
*t
, unsigned int cpu
)
192 lazy_hcall(LHCALL_LOAD_TLS
, __pa(&t
->tls_array
), cpu
, 0);
/*
 * paravirt_ops.set_ldt hook.
 *
 * NOTE(review): damaged copy.  Only the signature survives; prefixes 196-198
 * are absent.  Presumably the body is empty - confirm against a pristine
 * copy.
 */
195 static void lguest_set_ldt(const void *addr
, unsigned entries
)
/*
 * paravirt_ops.load_tr_desc hook.
 *
 * NOTE(review): damaged copy.  Only the signature survives; prefixes 200-202
 * are absent.  Presumably the body is empty - confirm against a pristine
 * copy.
 */
199 static void lguest_load_tr_desc(void)
/*
 * paravirt_ops.cpuid hook: calls native_cpuid() on the four register
 * pointers and then, per the surviving comments, restricts the feature bits
 * reported for leaf 1 and limits the reported extended-information level
 * when *eax exceeds 0x80000008.
 *
 * NOTE(review): damaged copy.  Prefixes 205-207, 209, 212, 214, 216-218 and
 * 222-225 are absent: the switch statement, the mask/OR statements that the
 * comments describe and the assignment guarded by the final "if" are all
 * missing, so the exact masks cannot be stated from this copy.
 */
203 static void lguest_cpuid(unsigned int *eax
, unsigned int *ebx
,
204 unsigned int *ecx
, unsigned int *edx
)
208 native_cpuid(eax
, ebx
, ecx
, edx
);
210 case 1: /* Basic feature request. */
211 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
213 /* Similarly: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
215 /* Host wants to know when we flush kernel pages: set PGE. */
219 /* Futureproof this a little: if they ask how much extended
220 * processor information, limit it to known fields. */
221 if (*eax
> 0x80000008)
/*
 * current_cr3 is the page-table argument lguest_flush_tlb_single() passes to
 * LHCALL_SET_PTE.  Presumably both variables cache the values last written
 * through the CR0/CR3 hooks - confirm against the writers.
 */
static unsigned long current_cr0, current_cr3;
/*
 * paravirt_ops.write_cr0 hook: sends LHCALL_TS through lazy_hcall() with
 * bit 3 of @val (val & 8) as its argument.
 *
 * NOTE(review): damaged copy.  Prefix 231 is absent; presumably it records
 * @val in current_cr0 - confirm against a pristine copy.
 */
228 static void lguest_write_cr0(unsigned long val
)
230 lazy_hcall(LHCALL_TS
, val
& 8, 0, 0);
/*
 * paravirt_ops.read_cr0 hook.
 *
 * NOTE(review): damaged copy.  Only the signature survives (prefixes 235-238
 * are absent); presumably it returns current_cr0 - confirm.
 */
234 static unsigned long lguest_read_cr0(void)
/*
 * paravirt_ops.clts hook: sends LHCALL_TS through lazy_hcall() with a zero
 * argument.
 *
 * NOTE(review): damaged copy.  Prefix 242 is absent, so one body line after
 * the hypercall is missing; its content cannot be determined from here.
 */
239 static void lguest_clts(void)
241 lazy_hcall(LHCALL_TS
, 0, 0, 0);
245 static unsigned long lguest_read_cr2(void)
247 return lguest_data
.cr2
;
/*
 * paravirt_ops.write_cr3 hook: sends LHCALL_NEW_PGTABLE through lazy_hcall()
 * with @cr3.
 *
 * NOTE(review): damaged copy.  Prefix 253 is absent; presumably it records
 * @cr3 in current_cr3 (which lguest_flush_tlb_single() reads) - confirm.
 */
250 static void lguest_write_cr3(unsigned long cr3
)
252 lazy_hcall(LHCALL_NEW_PGTABLE
, cr3
, 0, 0);
/*
 * paravirt_ops.read_cr3 hook.
 *
 * NOTE(review): damaged copy.  Only the signature survives (prefixes 257-260
 * are absent); presumably it returns current_cr3 - confirm.
 */
256 static unsigned long lguest_read_cr3(void)
/*
 * paravirt_ops.read_cr4 hook.
 *
 * NOTE(review): damaged copy.  Only the comment and signature survive
 * (prefixes 263-266 are absent); the returned value cannot be stated from
 * this copy.
 */
261 /* Used to enable/disable PGE, but we don't care. */
262 static unsigned long lguest_read_cr4(void)
/*
 * paravirt_ops.write_cr4 hook.
 *
 * NOTE(review): damaged copy.  Only the signature survives (prefixes 268-270
 * are absent).  Presumably the body is empty - confirm against a pristine
 * copy.
 */
267 static void lguest_write_cr4(unsigned long val
)
/*
 * paravirt_ops.set_pte_at hook: sends LHCALL_SET_PTE through lazy_hcall()
 * with the physical address of @mm's pgd, @addr and the low word of @pteval.
 *
 * NOTE(review): damaged copy.  Prefix 274 is absent; presumably it stores
 * @pteval through @ptep before the hypercall - confirm.
 */
271 static void lguest_set_pte_at(struct mm_struct
*mm
, unsigned long addr
,
272 pte_t
*ptep
, pte_t pteval
)
275 lazy_hcall(LHCALL_SET_PTE
, __pa(mm
->pgd
), addr
, pteval
.pte_low
);
/*
 * paravirt_ops.set_pmd hook: sends LHCALL_SET_PMD through lazy_hcall() with
 * the page-aligned physical address of @pmdp and the index of the entry
 * within that page (byte offset divided by 4).
 *
 * NOTE(review): damaged copy.  Prefix 281 is absent; presumably it stores
 * @pmdval through @pmdp before the hypercall - confirm.
 */
278 /* We only support two-level pagetables at the moment. */
279 static void lguest_set_pmd(pmd_t
*pmdp
, pmd_t pmdval
)
282 lazy_hcall(LHCALL_SET_PMD
, __pa(pmdp
)&PAGE_MASK
,
283 (__pa(pmdp
)&(PAGE_SIZE
-1))/4, 0);
/*
 * paravirt_ops.set_pte hook: sends LHCALL_FLUSH_TLB with first argument 1
 * through lazy_hcall() - the same request lguest_flush_tlb_kernel() makes.
 *
 * NOTE(review): damaged copy.  Prefixes 289 and 291 are absent; presumably
 * they are the store of @pteval through @ptep and the condition that the
 * "before initial setup" comment refers to - confirm.
 */
286 /* FIXME: Eliminate all callers of this. */
287 static void lguest_set_pte(pte_t
*ptep
, pte_t pteval
)
290 /* Don't bother with hypercall before initial setup. */
292 lazy_hcall(LHCALL_FLUSH_TLB
, 1, 0, 0);
295 static void lguest_flush_tlb_single(unsigned long addr
)
297 /* Simply set it to zero, and it will fault back in. */
298 lazy_hcall(LHCALL_SET_PTE
, current_cr3
, addr
, 0);
301 static void lguest_flush_tlb_user(void)
303 lazy_hcall(LHCALL_FLUSH_TLB
, 0, 0, 0);
306 static void lguest_flush_tlb_kernel(void)
308 lazy_hcall(LHCALL_FLUSH_TLB
, 1, 0, 0);
311 static void disable_lguest_irq(unsigned int irq
)
313 set_bit(irq
, lguest_data
.blocked_interrupts
);
316 static void enable_lguest_irq(unsigned int irq
)
318 clear_bit(irq
, lguest_data
.blocked_interrupts
);
319 /* FIXME: If it's pending? */
/*
 * irq_chip used for every lguest interrupt: .mask and .mask_ack both call
 * disable_lguest_irq(), .unmask calls enable_lguest_irq().
 *
 * NOTE(review): damaged copy.  Prefix 323 is absent, so one initializer line
 * ahead of .mask is missing, as is the closing "};" - restore from a
 * pristine copy.
 */
322 static struct irq_chip lguest_irq_controller
= {
324 .mask
= disable_lguest_irq
,
325 .mask_ack
= disable_lguest_irq
,
326 .unmask
= enable_lguest_irq
,
/*
 * paravirt_ops.init_IRQ hook.  For each i below LGUEST_IRQS the vector is
 * FIRST_EXTERNAL_VECTOR + i; every vector except SYSCALL_VECTOR gets an
 * interrupt gate pointing at interrupt[i] and has lguest_irq_controller
 * attached.  irq_ctx_init() is then called for the current processor.
 *
 * NOTE(review): damaged copy.  Prefixes 330-332 and 338-340 are absent:
 * presumably the declaration of i, the final argument of
 * set_irq_chip_and_handler() (the handler) and the closing braces - confirm
 * against a pristine copy.
 */
329 static void __init
lguest_init_IRQ(void)
333 for (i
= 0; i
< LGUEST_IRQS
; i
++) {
334 int vector
= FIRST_EXTERNAL_VECTOR
+ i
;
335 if (vector
!= SYSCALL_VECTOR
) {
336 set_intr_gate(vector
, interrupt
[i
]);
337 set_irq_chip_and_handler(i
, &lguest_irq_controller
,
341 irq_ctx_init(smp_processor_id());
344 static unsigned long lguest_get_wallclock(void)
346 return hcall(LHCALL_GET_WALLCLOCK
, 0, 0, 0);
349 static void lguest_time_irq(unsigned int irq
, struct irq_desc
*desc
)
351 do_timer(hcall(LHCALL_TIMER_READ
, 0, 0, 0));
352 update_process_times(user_mode_vm(get_irq_regs()));
355 static u64 sched_clock_base
;
356 static void lguest_time_init(void)
358 set_irq_handler(0, lguest_time_irq
);
359 hcall(LHCALL_TIMER_READ
, 0, 0, 0);
360 sched_clock_base
= jiffies_64
;
361 enable_lguest_irq(0);
364 static unsigned long long lguest_sched_clock(void)
366 return (jiffies_64
- sched_clock_base
) * (1000000000 / HZ
);
369 static void lguest_load_esp0(struct tss_struct
*tss
,
370 struct thread_struct
*thread
)
372 lazy_hcall(LHCALL_SET_STACK
, __KERNEL_DS
|0x1, thread
->esp0
,
373 THREAD_SIZE
/PAGE_SIZE
);
/* paravirt_ops.set_debugreg hook; currently does nothing with its arguments. */
static void lguest_set_debugreg(int regno, unsigned long value)
{
	/* FIXME: Implement */
}
/*
 * paravirt_ops.wbinvd hook.
 *
 * NOTE(review): damaged copy.  Only the signature survives (prefixes 382-384
 * are absent).  Presumably the body is empty - confirm against a pristine
 * copy.
 */
381 static void lguest_wbinvd(void)
/*
 * Local-APIC accessors, compiled only under CONFIG_X86_LOCAL_APIC and
 * installed as paravirt_ops.apic_write / apic_write_atomic / apic_read.
 *
 * NOTE(review): damaged copy.  Only the two signatures survive; prefixes
 * 387-389 and 391-395 are absent, which covers both bodies and the closing
 * #endif.  The value returned by lguest_apic_read() cannot be stated from
 * this copy.
 */
385 #ifdef CONFIG_X86_LOCAL_APIC
386 static void lguest_apic_write(unsigned long reg
, unsigned long v
)
390 static unsigned long lguest_apic_read(unsigned long reg
)
396 static void lguest_safe_halt(void)
398 hcall(LHCALL_HALT
, 0, 0, 0);
401 static void lguest_power_off(void)
403 hcall(LHCALL_CRASH
, __pa("Power down"), 0, 0);
/*
 * Panic notifier callback: issues LHCALL_CRASH with the physical address of
 * @p.  @nb and @l are not used in the visible code.
 *
 * NOTE(review): damaged copy.  Prefix 409 is absent, so the return statement
 * is missing and the notifier return value cannot be stated from this copy.
 */
406 static int lguest_panic(struct notifier_block
*nb
, unsigned long l
, void *p
)
408 hcall(LHCALL_CRASH
, __pa(p
), 0, 0);
412 static struct notifier_block paniced
= {
413 .notifier_call
= lguest_panic
/*
 * paravirt_ops.memory_setup hook: registers the paniced notifier on
 * panic_notifier_list, then adds one E820_RAM region covering physical
 * addresses 0 up to PFN_PHYS(boot->max_pfn).
 *
 * NOTE(review): damaged copy.  Prefixes 420-421 and 423-425 are absent, so
 * the return statement is missing and the string this function returns
 * cannot be stated from this copy.
 */
416 static __init
char *lguest_memory_setup(void)
418 /* We do this here because lockcheck barfs if before start_kernel */
419 atomic_notifier_chain_register(&panic_notifier_list
, &paniced
);
422 add_memory_region(0, PFN_PHYS(boot
->max_pfn
), E820_RAM
);
/*
 * Table of replacement instruction sequences, indexed by PARAVIRT_PATCH(op).
 * Each entry holds the start and end addresses of the assembler fragment
 * used for irq_disable, irq_enable, restore_fl and save_fl respectively.
 *
 * NOTE(review): damaged copy.  Prefixes 427, 429 and 434 are absent:
 * presumably the opening brace of the struct, the line that closes it and
 * declares the lguest_insns[] array (lguest_patch() indexes it and takes
 * ARRAY_SIZE of it), and the closing "};" - confirm against a pristine copy.
 */
426 static const struct lguest_insns
428 const char *start
, *end
;
430 [PARAVIRT_PATCH(irq_disable
)] = { lgstart_cli
, lgend_cli
},
431 [PARAVIRT_PATCH(irq_enable
)] = { lgstart_sti
, lgend_sti
},
432 [PARAVIRT_PATCH(restore_fl
)] = { lgstart_popf
, lgend_popf
},
433 [PARAVIRT_PATCH(save_fl
)] = { lgstart_pushf
, lgend_pushf
},
/*
 * paravirt_ops.patch hook.  If @type is outside lguest_insns[] or has no
 * replacement recorded, defer to paravirt_patch_default().  Otherwise the
 * replacement length is end - start; a second fallback to
 * paravirt_patch_default() covers the case where the replacement does not
 * fit, and the replacement bytes are then copied over @insns with memcpy().
 *
 * NOTE(review): damaged copy.  Prefixes 446 and 450 are absent: presumably
 * the test comparing @len with insn_len that guards the second fallback, and
 * the final return statement.  The value returned after the memcpy() cannot
 * be stated from this copy.
 */
435 static unsigned lguest_patch(u8 type
, u16 clobber
, void *insns
, unsigned len
)
437 unsigned int insn_len
;
439 /* Don't touch it if we don't have a replacement */
440 if (type
>= ARRAY_SIZE(lguest_insns
) || !lguest_insns
[type
].start
)
441 return paravirt_patch_default(type
, clobber
, insns
, len
);
443 insn_len
= lguest_insns
[type
].end
- lguest_insns
[type
].start
;
445 /* Similarly if we can't fit replacement. */
447 return paravirt_patch_default(type
, clobber
, insns
, len
);
449 memcpy(insns
, lguest_insns
[type
].start
, insn_len
);
453 __init
void lguest_init(void)
455 paravirt_ops
.name
= "lguest";
456 paravirt_ops
.paravirt_enabled
= 1;
457 paravirt_ops
.kernel_rpl
= 1;
459 paravirt_ops
.save_fl
= save_fl
;
460 paravirt_ops
.restore_fl
= restore_fl
;
461 paravirt_ops
.irq_disable
= irq_disable
;
462 paravirt_ops
.irq_enable
= irq_enable
;
463 paravirt_ops
.load_gdt
= lguest_load_gdt
;
464 paravirt_ops
.memory_setup
= lguest_memory_setup
;
465 paravirt_ops
.cpuid
= lguest_cpuid
;
466 paravirt_ops
.write_cr3
= lguest_write_cr3
;
467 paravirt_ops
.flush_tlb_user
= lguest_flush_tlb_user
;
468 paravirt_ops
.flush_tlb_single
= lguest_flush_tlb_single
;
469 paravirt_ops
.flush_tlb_kernel
= lguest_flush_tlb_kernel
;
470 paravirt_ops
.set_pte
= lguest_set_pte
;
471 paravirt_ops
.set_pte_at
= lguest_set_pte_at
;
472 paravirt_ops
.set_pmd
= lguest_set_pmd
;
473 #ifdef CONFIG_X86_LOCAL_APIC
474 paravirt_ops
.apic_write
= lguest_apic_write
;
475 paravirt_ops
.apic_write_atomic
= lguest_apic_write
;
476 paravirt_ops
.apic_read
= lguest_apic_read
;
478 paravirt_ops
.load_idt
= lguest_load_idt
;
479 paravirt_ops
.iret
= lguest_iret
;
480 paravirt_ops
.load_esp0
= lguest_load_esp0
;
481 paravirt_ops
.load_tr_desc
= lguest_load_tr_desc
;
482 paravirt_ops
.set_ldt
= lguest_set_ldt
;
483 paravirt_ops
.load_tls
= lguest_load_tls
;
484 paravirt_ops
.set_debugreg
= lguest_set_debugreg
;
485 paravirt_ops
.clts
= lguest_clts
;
486 paravirt_ops
.read_cr0
= lguest_read_cr0
;
487 paravirt_ops
.write_cr0
= lguest_write_cr0
;
488 paravirt_ops
.init_IRQ
= lguest_init_IRQ
;
489 paravirt_ops
.read_cr2
= lguest_read_cr2
;
490 paravirt_ops
.read_cr3
= lguest_read_cr3
;
491 paravirt_ops
.read_cr4
= lguest_read_cr4
;
492 paravirt_ops
.write_cr4
= lguest_write_cr4
;
493 paravirt_ops
.write_gdt_entry
= lguest_write_gdt_entry
;
494 paravirt_ops
.write_idt_entry
= lguest_write_idt_entry
;
495 paravirt_ops
.patch
= lguest_patch
;
496 paravirt_ops
.safe_halt
= lguest_safe_halt
;
497 paravirt_ops
.get_wallclock
= lguest_get_wallclock
;
498 paravirt_ops
.time_init
= lguest_time_init
;
499 paravirt_ops
.set_lazy_mode
= lguest_lazy_mode
;
500 paravirt_ops
.wbinvd
= lguest_wbinvd
;
501 paravirt_ops
.sched_clock
= lguest_sched_clock
;
503 hcall(LHCALL_LGUEST_INIT
, __pa(&lguest_data
), 0, 0);
504 strncpy(boot_command_line
, boot
->cmdline
, COMMAND_LINE_SIZE
);
506 /* We use top of mem for initial pagetables. */
507 init_pg_tables_end
= __pa(pg0
);
509 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS
) : "memory");
511 reserve_top_address(lguest_data
.reserve_mem
);
515 paravirt_disable_iospace();
517 cpu_detect(&new_cpu_data
);
518 /* head.S usually sets up the first capability word, so do it here. */
519 new_cpu_data
.x86_capability
[0] = cpuid_edx(1);
521 /* Math is always hard! */
522 new_cpu_data
.hard_math
= 1;
524 #ifdef CONFIG_X86_MCE
533 add_preferred_console("hvc", 0, NULL
);
535 if (boot
->initrd_size
) {
536 /* We stash this at top of memory. */
537 INITRD_START
= boot
->max_pfn
*PAGE_SIZE
- boot
->initrd_size
;
538 INITRD_SIZE
= boot
->initrd_size
;
542 pm_power_off
= lguest_power_off
;