/*
 * Lguest specific paravirt-ops implementation
 *
 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
21 #include <linux/kernel.h>
22 #include <linux/start_kernel.h>
23 #include <linux/string.h>
24 #include <linux/console.h>
25 #include <linux/screen_info.h>
26 #include <linux/irq.h>
27 #include <linux/interrupt.h>
28 #include <linux/clocksource.h>
29 #include <linux/clockchips.h>
30 #include <linux/lguest.h>
31 #include <linux/lguest_launcher.h>
32 #include <linux/lguest_bus.h>
33 #include <asm/paravirt.h>
34 #include <asm/param.h>
36 #include <asm/pgtable.h>
38 #include <asm/setup.h>
/* Declarations for definitions in lguest_guest.S */
/* Bounds stored into lguest_data.noirq_start / .noirq_end. */
extern char lguest_noirq_start[], lguest_noirq_end[];
/* Start/end markers of instruction sequences; the cli/sti/popf/pushf pairs
 * populate the lguest_insns patch table. */
extern const char lgstart_cli[], lgend_cli[];
extern const char lgstart_sti[], lgend_sti[];
extern const char lgstart_popf[], lgend_popf[];
extern const char lgstart_pushf[], lgend_pushf[];
extern const char lgstart_iret[], lgend_iret[];
/* Installed as paravirt_ops.iret by lguest_init(). */
extern void lguest_iret(void);
52 struct lguest_data lguest_data
= {
53 .hcall_status
= { [0 ... LHCALL_RING_SIZE
-1] = 0xFF },
54 .noirq_start
= (u32
)lguest_noirq_start
,
55 .noirq_end
= (u32
)lguest_noirq_end
,
56 .blocked_interrupts
= { 1 }, /* Block timer interrupts */
58 struct lguest_device_desc
*lguest_devices
;
59 static cycle_t clock_base
;
61 static enum paravirt_lazy_mode lazy_mode
;
62 static void lguest_lazy_mode(enum paravirt_lazy_mode mode
)
64 if (mode
== PARAVIRT_LAZY_FLUSH
) {
65 if (unlikely(lazy_mode
!= PARAVIRT_LAZY_NONE
))
66 hcall(LHCALL_FLUSH_ASYNC
, 0, 0, 0);
69 if (mode
== PARAVIRT_LAZY_NONE
)
70 hcall(LHCALL_FLUSH_ASYNC
, 0, 0, 0);
74 static void lazy_hcall(unsigned long call
,
79 if (lazy_mode
== PARAVIRT_LAZY_NONE
)
80 hcall(call
, arg1
, arg2
, arg3
);
82 async_hcall(call
, arg1
, arg2
, arg3
);
85 void async_hcall(unsigned long call
,
86 unsigned long arg1
, unsigned long arg2
, unsigned long arg3
)
88 /* Note: This code assumes we're uniprocessor. */
89 static unsigned int next_call
;
92 local_irq_save(flags
);
93 if (lguest_data
.hcall_status
[next_call
] != 0xFF) {
94 /* Table full, so do normal hcall which will flush table. */
95 hcall(call
, arg1
, arg2
, arg3
);
97 lguest_data
.hcalls
[next_call
].eax
= call
;
98 lguest_data
.hcalls
[next_call
].edx
= arg1
;
99 lguest_data
.hcalls
[next_call
].ebx
= arg2
;
100 lguest_data
.hcalls
[next_call
].ecx
= arg3
;
101 /* Make sure host sees arguments before "valid" flag. */
103 lguest_data
.hcall_status
[next_call
] = 0;
104 if (++next_call
== LHCALL_RING_SIZE
)
107 local_irq_restore(flags
);
/*
 * Issue LHCALL_SEND_DMA for @key, passing the physical address of @dma.
 *
 * NOTE(review): the embedded numbering jumps from 110 to 113, so besides the
 * opening brace one statement (original line 112) appears to be missing
 * ahead of the hypercall, as is the closing brace -- recover from upstream.
 */
110 void lguest_send_dma(unsigned long key
, struct lguest_dma
*dma
)
113 hcall(LHCALL_SEND_DMA
, key
, __pa(dma
), 0);
/*
 * Issue LHCALL_BIND_DMA for @key with the physical address of @dmas and
 * (num << 8) | irq as the final argument; the visible "if" tests for a zero
 * hypercall result.
 *
 * NOTE(review): the embedded numbering jumps from 119 to 124: the statement
 * guarded by this "if", the function's return statement(s) and both braces
 * are missing from this copy -- recover from upstream.
 */
116 int lguest_bind_dma(unsigned long key
, struct lguest_dma
*dmas
,
117 unsigned int num
, u8 irq
)
119 if (!hcall(LHCALL_BIND_DMA
, key
, __pa(dmas
), (num
<< 8) | irq
))
124 void lguest_unbind_dma(unsigned long key
, struct lguest_dma
*dmas
)
126 hcall(LHCALL_BIND_DMA
, key
, __pa(dmas
), 0);
129 /* For guests, device memory can be used as normal memory, so we cast away the
130 * __iomem to quieten sparse. */
131 void *lguest_map(unsigned long phys_addr
, unsigned long pages
)
133 return (__force
void *)ioremap(phys_addr
, PAGE_SIZE
*pages
);
136 void lguest_unmap(void *addr
)
138 iounmap((__force
void __iomem
*)addr
);
141 static unsigned long save_fl(void)
143 return lguest_data
.irq_enabled
;
146 static void restore_fl(unsigned long flags
)
148 /* FIXME: Check if interrupt pending... */
149 lguest_data
.irq_enabled
= flags
;
152 static void irq_disable(void)
154 lguest_data
.irq_enabled
= 0;
157 static void irq_enable(void)
159 /* FIXME: Check if interrupt pending... */
160 lguest_data
.irq_enabled
= X86_EFLAGS_IF
;
163 static void lguest_write_idt_entry(struct desc_struct
*dt
,
164 int entrynum
, u32 low
, u32 high
)
166 write_dt_entry(dt
, entrynum
, low
, high
);
167 hcall(LHCALL_LOAD_IDT_ENTRY
, entrynum
, low
, high
);
170 static void lguest_load_idt(const struct Xgt_desc_struct
*desc
)
173 struct desc_struct
*idt
= (void *)desc
->address
;
175 for (i
= 0; i
< (desc
->size
+1)/8; i
++)
176 hcall(LHCALL_LOAD_IDT_ENTRY
, i
, idt
[i
].a
, idt
[i
].b
);
179 static void lguest_load_gdt(const struct Xgt_desc_struct
*desc
)
181 BUG_ON((desc
->size
+1)/8 != GDT_ENTRIES
);
182 hcall(LHCALL_LOAD_GDT
, __pa(desc
->address
), GDT_ENTRIES
, 0);
185 static void lguest_write_gdt_entry(struct desc_struct
*dt
,
186 int entrynum
, u32 low
, u32 high
)
188 write_dt_entry(dt
, entrynum
, low
, high
);
189 hcall(LHCALL_LOAD_GDT
, __pa(dt
), GDT_ENTRIES
, 0);
192 static void lguest_load_tls(struct thread_struct
*t
, unsigned int cpu
)
194 lazy_hcall(LHCALL_LOAD_TLS
, __pa(&t
->tls_array
), cpu
, 0);
/* paravirt set_ldt hook: does nothing.
 * NOTE(review): the body was absent from the damaged copy; its numbering
 * left room only for the two brace lines, so an empty body is restored. */
static void lguest_set_ldt(const void *addr, unsigned entries)
{
}
/* paravirt load_tr_desc hook: does nothing.
 * NOTE(review): the body was absent from the damaged copy; its numbering
 * left room only for the two brace lines, so an empty body is restored. */
static void lguest_load_tr_desc(void)
{
}
/*
 * paravirt cpuid hook: runs native_cpuid() on the caller's registers; the
 * surviving comments indicate the results are then filtered per function.
 *
 * NOTE(review): the embedded numbering skips 207-209, 211, 214, 216,
 * 218-220 and 224-227.  The switch statement, every mask/assignment under
 * the comments below, the second case label and all braces are missing from
 * this copy -- the block cannot be repaired from what is visible; recover
 * it from upstream.
 */
205 static void lguest_cpuid(unsigned int *eax
, unsigned int *ebx
,
206 unsigned int *ecx
, unsigned int *edx
)
210 native_cpuid(eax
, ebx
, ecx
, edx
);
212 case 1: /* Basic feature request. */
213 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
215 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
217 /* Host wants to know when we flush kernel pages: set PGE. */
221 /* Futureproof this a little: if they ask how much extended
222 * processor information, limit it to known fields. */
223 if (*eax
> 0x80000008)
/* File-local CR0/CR3 values; lguest_flush_tlb_single() passes current_cr3
 * as the page-table argument of LHCALL_SET_PTE. */
static unsigned long current_cr0, current_cr3;
230 static void lguest_write_cr0(unsigned long val
)
232 lazy_hcall(LHCALL_TS
, val
& 8, 0, 0);
236 static unsigned long lguest_read_cr0(void)
/*
 * paravirt clts hook: issue LHCALL_TS with a zero argument via lazy_hcall().
 *
 * NOTE(review): the embedded numbering jumps from 243 to 247, leaving room
 * for one more statement (original line 244) before the closing brace; it
 * and both braces are missing from this copy -- recover from upstream.
 */
241 static void lguest_clts(void)
243 lazy_hcall(LHCALL_TS
, 0, 0, 0);
247 static unsigned long lguest_read_cr2(void)
249 return lguest_data
.cr2
;
252 static void lguest_write_cr3(unsigned long cr3
)
254 lazy_hcall(LHCALL_NEW_PGTABLE
, cr3
, 0, 0);
258 static unsigned long lguest_read_cr3(void)
/*
 * NOTE(review): only the signature of lguest_read_cr4() survives; the
 * embedded numbering jumps from 264 to 269, so the whole body (braces and a
 * return statement) is missing and the returned value cannot be determined
 * from this copy -- recover from upstream.
 */
263 /* Used to enable/disable PGE, but we don't care. */
264 static unsigned long lguest_read_cr4(void)
/* paravirt write_cr4 hook: @val is ignored.
 * NOTE(review): the body was absent from the damaged copy; its numbering
 * left room only for the two brace lines, so an empty body is restored. */
static void lguest_write_cr4(unsigned long val)
{
}
273 static void lguest_set_pte_at(struct mm_struct
*mm
, unsigned long addr
,
274 pte_t
*ptep
, pte_t pteval
)
277 lazy_hcall(LHCALL_SET_PTE
, __pa(mm
->pgd
), addr
, pteval
.pte_low
);
280 /* We only support two-level pagetables at the moment. */
281 static void lguest_set_pmd(pmd_t
*pmdp
, pmd_t pmdval
)
284 lazy_hcall(LHCALL_SET_PMD
, __pa(pmdp
)&PAGE_MASK
,
285 (__pa(pmdp
)&(PAGE_SIZE
-1))/4, 0);
/*
 * paravirt set_pte hook; the surviving statement issues LHCALL_FLUSH_TLB
 * with first argument 1 via lazy_hcall().
 *
 * NOTE(review): the embedded numbering skips 290, 291, 293 and 295: a
 * statement ahead of the comment below, the condition that the comment
 * describes (guarding the hypercall) and both braces are missing from this
 * copy -- recover from upstream.
 */
288 /* FIXME: Eliminate all callers of this. */
289 static void lguest_set_pte(pte_t
*ptep
, pte_t pteval
)
292 /* Don't bother with hypercall before initial setup. */
294 lazy_hcall(LHCALL_FLUSH_TLB
, 1, 0, 0);
297 static void lguest_flush_tlb_single(unsigned long addr
)
299 /* Simply set it to zero, and it will fault back in. */
300 lazy_hcall(LHCALL_SET_PTE
, current_cr3
, addr
, 0);
303 static void lguest_flush_tlb_user(void)
305 lazy_hcall(LHCALL_FLUSH_TLB
, 0, 0, 0);
308 static void lguest_flush_tlb_kernel(void)
310 lazy_hcall(LHCALL_FLUSH_TLB
, 1, 0, 0);
313 static void disable_lguest_irq(unsigned int irq
)
315 set_bit(irq
, lguest_data
.blocked_interrupts
);
318 static void enable_lguest_irq(unsigned int irq
)
320 clear_bit(irq
, lguest_data
.blocked_interrupts
);
321 /* FIXME: If it's pending? */
/*
 * irq_chip used for every lguest interrupt (see lguest_init_IRQ()): mask and
 * mask_ack both block the irq, unmask unblocks it.
 *
 * NOTE(review): the embedded numbering skips 325 (one initializer line ahead
 * of .mask) and 329 (the closing "};"); both are missing from this copy --
 * recover from upstream.
 */
324 static struct irq_chip lguest_irq_controller
= {
326 .mask
= disable_lguest_irq
,
327 .mask_ack
= disable_lguest_irq
,
328 .unmask
= enable_lguest_irq
,
/*
 * paravirt init_IRQ hook: for each of the LGUEST_IRQS interrupts whose
 * vector (FIRST_EXTERNAL_VECTOR + i) is not SYSCALL_VECTOR, install the
 * interrupt[i] gate and attach lguest_irq_controller; finally call
 * irq_ctx_init() for the current CPU.
 *
 * NOTE(review): the embedded numbering skips 332-334 and 340-342: the
 * declaration of "i", the final argument of set_irq_chip_and_handler() (the
 * flow handler) and all braces are missing from this copy -- recover from
 * upstream.
 */
331 static void __init
lguest_init_IRQ(void)
335 for (i
= 0; i
< LGUEST_IRQS
; i
++) {
336 int vector
= FIRST_EXTERNAL_VECTOR
+ i
;
337 if (vector
!= SYSCALL_VECTOR
) {
338 set_intr_gate(vector
, interrupt
[i
]);
339 set_irq_chip_and_handler(i
, &lguest_irq_controller
,
343 irq_ctx_init(smp_processor_id());
346 static unsigned long lguest_get_wallclock(void)
348 return hcall(LHCALL_GET_WALLCLOCK
, 0, 0, 0);
/*
 * clocksource read callback: returns native_read_tsc() when
 * lguest_data.tsc_khz is non-zero.
 *
 * NOTE(review): the embedded numbering jumps from 354 to 359: the fallback
 * return for the tsc_khz == 0 case and both braces are missing from this
 * copy (lguest_time_init()'s comment mentions a jiffies-based clock --
 * presumably that is what is returned; verify against upstream).
 */
351 static cycle_t
lguest_clock_read(void)
353 if (lguest_data
.tsc_khz
)
354 return native_read_tsc();
/*
 * NOTE(review): the embedded numbering skips 361-362 (two initializer lines
 * ahead of .read) and 364 (the closing "};"); they are missing from this
 * copy -- recover from upstream.  shift, mult, mask and flags are filled in
 * at run time by lguest_time_init().
 */
359 /* This is what we tell the kernel is our clocksource. */
360 static struct clocksource lguest_clock
= {
363 .read
= lguest_clock_read
,
366 static unsigned long long lguest_sched_clock(void)
368 return cyc2ns(&lguest_clock
, lguest_clock_read() - clock_base
);
/*
 * clock_event_device set_next_event callback: a @delta below
 * LG_CLOCK_MIN_DELTA is logged (rate-limited); otherwise @delta is passed to
 * LHCALL_SET_CLOCKEVENT.
 *
 * NOTE(review): the embedded numbering skips 376, 381-382 and 384-385: both
 * return statements (the error return for a too-small delta and the final
 * return) and the braces are missing from this copy -- recover from
 * upstream.
 */
371 /* We also need a "struct clock_event_device": Linux asks us to set it to go
372 * off some time in the future. Actually, James Morris figured all this out, I
373 * just applied the patch. */
374 static int lguest_clockevent_set_next_event(unsigned long delta
,
375 struct clock_event_device
*evt
)
377 if (delta
< LG_CLOCK_MIN_DELTA
) {
378 if (printk_ratelimit())
379 printk(KERN_DEBUG
"%s: small delta %lu ns\n",
380 __FUNCTION__
, delta
);
383 hcall(LHCALL_SET_CLOCKEVENT
, delta
, 0, 0);
/*
 * clock_event_device set_mode callback: UNUSED and SHUTDOWN issue
 * LHCALL_SET_CLOCKEVENT with a zero argument.
 *
 * NOTE(review): the embedded numbering skips 389-390, 395, 398, 400 and
 * 402-404: the "switch" line, the statement(s) ending each case, whatever
 * the PERIODIC case does, and all braces are missing from this copy --
 * recover from upstream.
 */
387 static void lguest_clockevent_set_mode(enum clock_event_mode mode
,
388 struct clock_event_device
*evt
)
391 case CLOCK_EVT_MODE_UNUSED
:
392 case CLOCK_EVT_MODE_SHUTDOWN
:
393 /* A 0 argument shuts the clock down. */
394 hcall(LHCALL_SET_CLOCKEVENT
, 0, 0, 0);
396 case CLOCK_EVT_MODE_ONESHOT
:
397 /* This is what we expect. */
399 case CLOCK_EVT_MODE_PERIODIC
:
401 case CLOCK_EVT_MODE_RESUME
:
/*
 * NOTE(review): the embedded numbering skips 408 (one initializer line ahead
 * of .features), 412-414 (three initializer lines) and 417 (the closing
 * "};"); they are missing from this copy -- recover from upstream.  cpumask
 * is assigned at run time by lguest_time_init().
 */
406 /* This describes our primitive timer chip. */
407 static struct clock_event_device lguest_clockevent
= {
409 .features
= CLOCK_EVT_FEAT_ONESHOT
,
410 .set_next_event
= lguest_clockevent_set_next_event
,
411 .set_mode
= lguest_clockevent_set_mode
,
415 .min_delta_ns
= LG_CLOCK_MIN_DELTA
,
416 .max_delta_ns
= LG_CLOCK_MAX_DELTA
,
419 /* This is the Guest timer interrupt handler (hardware interrupt 0). We just
420 * call the clockevent infrastructure and it does whatever needs doing. */
421 static void lguest_time_irq(unsigned int irq
, struct irq_desc
*desc
)
425 /* Don't interrupt us while this is running. */
426 local_irq_save(flags
);
427 lguest_clockevent
.event_handler(&lguest_clockevent
);
428 local_irq_restore(flags
);
431 static void lguest_time_init(void)
433 set_irq_handler(0, lguest_time_irq
);
435 /* We use the TSC if the Host tells us we can, otherwise a dumb
436 * jiffies-based clock. */
437 if (lguest_data
.tsc_khz
) {
438 lguest_clock
.shift
= 22;
439 lguest_clock
.mult
= clocksource_khz2mult(lguest_data
.tsc_khz
,
441 lguest_clock
.mask
= CLOCKSOURCE_MASK(64);
442 lguest_clock
.flags
= CLOCK_SOURCE_IS_CONTINUOUS
;
444 /* To understand this, start at kernel/time/jiffies.c... */
445 lguest_clock
.shift
= 8;
446 lguest_clock
.mult
= (((u64
)NSEC_PER_SEC
<<8)/ACTHZ
) << 8;
447 lguest_clock
.mask
= CLOCKSOURCE_MASK(32);
449 clock_base
= lguest_clock_read();
450 clocksource_register(&lguest_clock
);
452 /* We can't set cpumask in the initializer: damn C limitations! */
453 lguest_clockevent
.cpumask
= cpumask_of_cpu(0);
454 clockevents_register_device(&lguest_clockevent
);
456 enable_lguest_irq(0);
459 static void lguest_load_esp0(struct tss_struct
*tss
,
460 struct thread_struct
*thread
)
462 lazy_hcall(LHCALL_SET_STACK
, __KERNEL_DS
|0x1, thread
->esp0
,
463 THREAD_SIZE
/PAGE_SIZE
);
/* paravirt set_debugreg hook: currently a no-op; both arguments are
 * ignored. */
static void lguest_set_debugreg(int regno, unsigned long value)
{
	/* FIXME: Implement */
}
/* paravirt wbinvd hook: does nothing.
 * NOTE(review): the body was absent from the damaged copy; its numbering
 * left room only for the two brace lines, so an empty body is restored. */
static void lguest_wbinvd(void)
{
}
/*
 * Local-APIC accessors installed by lguest_init() when
 * CONFIG_X86_LOCAL_APIC is set.
 *
 * NOTE(review): only the two signatures survive.  The embedded numbering
 * skips 477-479 and 481-485: both function bodies (including the value
 * lguest_apic_read() returns) and the matching #endif are missing from this
 * copy -- recover from upstream.
 */
475 #ifdef CONFIG_X86_LOCAL_APIC
476 static void lguest_apic_write(unsigned long reg
, unsigned long v
)
480 static unsigned long lguest_apic_read(unsigned long reg
)
486 static void lguest_safe_halt(void)
488 hcall(LHCALL_HALT
, 0, 0, 0);
491 static void lguest_power_off(void)
493 hcall(LHCALL_CRASH
, __pa("Power down"), 0, 0);
/*
 * Panic notifier callback (registered through "paniced"): issue
 * LHCALL_CRASH with the physical address of @p.
 *
 * NOTE(review): the embedded numbering jumps from 498 to 502: the return
 * statement and both braces are missing from this copy -- recover from
 * upstream.
 */
496 static int lguest_panic(struct notifier_block
*nb
, unsigned long l
, void *p
)
498 hcall(LHCALL_CRASH
, __pa(p
), 0, 0);
502 static struct notifier_block paniced
= {
503 .notifier_call
= lguest_panic
/*
 * paravirt memory_setup hook: register the panic notifier, then add the
 * region described by the first E820_MAP entry (addr, size, type).
 *
 * NOTE(review): the embedded numbering skips 507, 510 and 512-513: the
 * return statement (this function returns char *) and both braces are
 * missing from this copy -- recover from upstream.
 */
506 static __init
char *lguest_memory_setup(void)
508 /* We do this here because lockcheck barfs if before start_kernel */
509 atomic_notifier_chain_register(&panic_notifier_list
, &paniced
);
511 add_memory_region(E820_MAP
->addr
, E820_MAP
->size
, E820_MAP
->type
);
515 static const struct lguest_insns
517 const char *start
, *end
;
519 [PARAVIRT_PATCH(irq_disable
)] = { lgstart_cli
, lgend_cli
},
520 [PARAVIRT_PATCH(irq_enable
)] = { lgstart_sti
, lgend_sti
},
521 [PARAVIRT_PATCH(restore_fl
)] = { lgstart_popf
, lgend_popf
},
522 [PARAVIRT_PATCH(save_fl
)] = { lgstart_pushf
, lgend_pushf
},
/*
 * paravirt patch hook: when lguest_insns has a replacement for @type, copy
 * it over @insns; otherwise defer to paravirt_patch_default().
 *
 * NOTE(review): the embedded numbering skips 525, 527, 531, 533, 535, 537
 * and 539-540: the condition guarding the second paravirt_patch_default()
 * call (described by the "can't fit" comment), the function's final return
 * statement and both braces are missing from this copy -- recover from
 * upstream.
 */
524 static unsigned lguest_patch(u8 type
, u16 clobber
, void *insns
, unsigned len
)
526 unsigned int insn_len
;
528 /* Don't touch it if we don't have a replacement */
529 if (type
>= ARRAY_SIZE(lguest_insns
) || !lguest_insns
[type
].start
)
530 return paravirt_patch_default(type
, clobber
, insns
, len
);
532 insn_len
= lguest_insns
[type
].end
- lguest_insns
[type
].start
;
534 /* Similarly if we can't fit replacement. */
536 return paravirt_patch_default(type
, clobber
, insns
, len
);
538 memcpy(insns
, lguest_insns
[type
].start
, insn_len
);
542 __init
void lguest_init(void *boot
)
544 /* Copy boot parameters first. */
545 memcpy(&boot_params
, boot
, PARAM_SIZE
);
546 memcpy(boot_command_line
, __va(boot_params
.hdr
.cmd_line_ptr
),
549 paravirt_ops
.name
= "lguest";
550 paravirt_ops
.paravirt_enabled
= 1;
551 paravirt_ops
.kernel_rpl
= 1;
553 paravirt_ops
.save_fl
= save_fl
;
554 paravirt_ops
.restore_fl
= restore_fl
;
555 paravirt_ops
.irq_disable
= irq_disable
;
556 paravirt_ops
.irq_enable
= irq_enable
;
557 paravirt_ops
.load_gdt
= lguest_load_gdt
;
558 paravirt_ops
.memory_setup
= lguest_memory_setup
;
559 paravirt_ops
.cpuid
= lguest_cpuid
;
560 paravirt_ops
.write_cr3
= lguest_write_cr3
;
561 paravirt_ops
.flush_tlb_user
= lguest_flush_tlb_user
;
562 paravirt_ops
.flush_tlb_single
= lguest_flush_tlb_single
;
563 paravirt_ops
.flush_tlb_kernel
= lguest_flush_tlb_kernel
;
564 paravirt_ops
.set_pte
= lguest_set_pte
;
565 paravirt_ops
.set_pte_at
= lguest_set_pte_at
;
566 paravirt_ops
.set_pmd
= lguest_set_pmd
;
567 #ifdef CONFIG_X86_LOCAL_APIC
568 paravirt_ops
.apic_write
= lguest_apic_write
;
569 paravirt_ops
.apic_write_atomic
= lguest_apic_write
;
570 paravirt_ops
.apic_read
= lguest_apic_read
;
572 paravirt_ops
.load_idt
= lguest_load_idt
;
573 paravirt_ops
.iret
= lguest_iret
;
574 paravirt_ops
.load_esp0
= lguest_load_esp0
;
575 paravirt_ops
.load_tr_desc
= lguest_load_tr_desc
;
576 paravirt_ops
.set_ldt
= lguest_set_ldt
;
577 paravirt_ops
.load_tls
= lguest_load_tls
;
578 paravirt_ops
.set_debugreg
= lguest_set_debugreg
;
579 paravirt_ops
.clts
= lguest_clts
;
580 paravirt_ops
.read_cr0
= lguest_read_cr0
;
581 paravirt_ops
.write_cr0
= lguest_write_cr0
;
582 paravirt_ops
.init_IRQ
= lguest_init_IRQ
;
583 paravirt_ops
.read_cr2
= lguest_read_cr2
;
584 paravirt_ops
.read_cr3
= lguest_read_cr3
;
585 paravirt_ops
.read_cr4
= lguest_read_cr4
;
586 paravirt_ops
.write_cr4
= lguest_write_cr4
;
587 paravirt_ops
.write_gdt_entry
= lguest_write_gdt_entry
;
588 paravirt_ops
.write_idt_entry
= lguest_write_idt_entry
;
589 paravirt_ops
.patch
= lguest_patch
;
590 paravirt_ops
.safe_halt
= lguest_safe_halt
;
591 paravirt_ops
.get_wallclock
= lguest_get_wallclock
;
592 paravirt_ops
.time_init
= lguest_time_init
;
593 paravirt_ops
.set_lazy_mode
= lguest_lazy_mode
;
594 paravirt_ops
.wbinvd
= lguest_wbinvd
;
595 paravirt_ops
.sched_clock
= lguest_sched_clock
;
597 hcall(LHCALL_LGUEST_INIT
, __pa(&lguest_data
), 0, 0);
599 /* We use top of mem for initial pagetables. */
600 init_pg_tables_end
= __pa(pg0
);
602 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS
) : "memory");
604 reserve_top_address(lguest_data
.reserve_mem
);
608 paravirt_disable_iospace();
610 cpu_detect(&new_cpu_data
);
611 /* head.S usually sets up the first capability word, so do it here. */
612 new_cpu_data
.x86_capability
[0] = cpuid_edx(1);
614 /* Math is always hard! */
615 new_cpu_data
.hard_math
= 1;
617 #ifdef CONFIG_X86_MCE
626 add_preferred_console("hvc", 0, NULL
);
628 pm_power_off
= lguest_power_off
;