Merge commit 'origin/master' into next
author Benjamin Herrenschmidt <benh@kernel.crashing.org>
Wed, 11 Nov 2009 23:59:04 +0000 (10:59 +1100)
committer Benjamin Herrenschmidt <benh@kernel.crashing.org>
Wed, 11 Nov 2009 23:59:04 +0000 (10:59 +1100)
102 files changed:
arch/powerpc/Kconfig
arch/powerpc/include/asm/exception-64s.h
arch/powerpc/include/asm/hugetlb.h
arch/powerpc/include/asm/hw_irq.h
arch/powerpc/include/asm/irq.h
arch/powerpc/include/asm/kvm.h
arch/powerpc/include/asm/kvm_asm.h
arch/powerpc/include/asm/kvm_book3s.h [new file with mode: 0644]
arch/powerpc/include/asm/kvm_book3s_64_asm.h [new file with mode: 0644]
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/asm/kvm_ppc.h
arch/powerpc/include/asm/mmu-hash64.h
arch/powerpc/include/asm/mmu_context.h
arch/powerpc/include/asm/nvram.h
arch/powerpc/include/asm/paca.h
arch/powerpc/include/asm/page.h
arch/powerpc/include/asm/page_64.h
arch/powerpc/include/asm/pgalloc-32.h
arch/powerpc/include/asm/pgalloc-64.h
arch/powerpc/include/asm/pgalloc.h
arch/powerpc/include/asm/pgtable-ppc64.h
arch/powerpc/include/asm/pgtable.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/crash.c
arch/powerpc/kernel/exceptions-64s.S
arch/powerpc/kernel/head_64.S
arch/powerpc/kernel/irq.c
arch/powerpc/kernel/lparcfg.c
arch/powerpc/kernel/nvram_64.c
arch/powerpc/kernel/perf_callchain.c
arch/powerpc/kernel/ppc_ksyms.c
arch/powerpc/kernel/proc_powerpc.c [new file with mode: 0644]
arch/powerpc/kernel/proc_ppc64.c [deleted file]
arch/powerpc/kernel/rtas_flash.c
arch/powerpc/kernel/rtasd.c [new file with mode: 0644]
arch/powerpc/kernel/setup_64.c
arch/powerpc/kernel/time.c
arch/powerpc/kvm/Kconfig
arch/powerpc/kvm/Makefile
arch/powerpc/kvm/book3s.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_32_mmu.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_64_emulate.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_64_exports.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_64_interrupts.S [new file with mode: 0644]
arch/powerpc/kvm/book3s_64_mmu.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_64_mmu_host.c [new file with mode: 0644]
arch/powerpc/kvm/book3s_64_rmhandlers.S [new file with mode: 0644]
arch/powerpc/kvm/book3s_64_slb.S [new file with mode: 0644]
arch/powerpc/kvm/booke.c
arch/powerpc/kvm/emulate.c
arch/powerpc/kvm/powerpc.c
arch/powerpc/kvm/timing.c
arch/powerpc/kvm/trace.h
arch/powerpc/mm/Makefile
arch/powerpc/mm/gup.c
arch/powerpc/mm/hash_utils_64.c
arch/powerpc/mm/hugetlbpage-hash64.c [new file with mode: 0644]
arch/powerpc/mm/hugetlbpage.c
arch/powerpc/mm/init_64.c
arch/powerpc/mm/mem.c
arch/powerpc/mm/mmu_context_hash64.c
arch/powerpc/mm/pgtable.c
arch/powerpc/mm/tlb_hash64.c
arch/powerpc/platforms/512x/mpc5121_ads_cpld.c
arch/powerpc/platforms/52xx/media5200.c
arch/powerpc/platforms/82xx/pq2ads-pci-pic.c
arch/powerpc/platforms/85xx/socrates_fpga_pic.c
arch/powerpc/platforms/86xx/gef_pic.c
arch/powerpc/platforms/8xx/m8xx_setup.c
arch/powerpc/platforms/Kconfig
arch/powerpc/platforms/cell/beat_interrupt.c
arch/powerpc/platforms/cell/spider-pic.c
arch/powerpc/platforms/chrp/Kconfig
arch/powerpc/platforms/chrp/setup.c
arch/powerpc/platforms/iseries/irq.c
arch/powerpc/platforms/powermac/pic.c
arch/powerpc/platforms/ps3/mm.c
arch/powerpc/platforms/pseries/Kconfig
arch/powerpc/platforms/pseries/Makefile
arch/powerpc/platforms/pseries/cmm.c
arch/powerpc/platforms/pseries/eeh_driver.c
arch/powerpc/platforms/pseries/reconfig.c
arch/powerpc/platforms/pseries/rtasd.c [deleted file]
arch/powerpc/platforms/pseries/scanlog.c
arch/powerpc/platforms/pseries/xics.c
arch/powerpc/sysdev/cpm1.c
arch/powerpc/sysdev/cpm2_pic.c
arch/powerpc/sysdev/fsl_msi.c
arch/powerpc/sysdev/i8259.c
arch/powerpc/sysdev/ipic.c
arch/powerpc/sysdev/mpc8xx_pic.c
arch/powerpc/sysdev/mpic.c
arch/powerpc/sysdev/mv64x60_pic.c
arch/powerpc/sysdev/qe_lib/qe_ic.c
arch/powerpc/sysdev/tsi108_pci.c
arch/powerpc/sysdev/uic.c
arch/powerpc/sysdev/xilinx_intc.c
drivers/macintosh/therm_adt746x.c
drivers/of/platform.c
fs/proc/proc_devtree.c
virt/kvm/kvm_main.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 2ba14e77296c8082e0670fbe6113ae9667286e39..61abde11a45ef9adc8f70db6b6fb6b50d490963b 100644
@@ -56,6 +56,16 @@ config IRQ_PER_CPU
        bool
        default y
 
+config NR_IRQS
+       int "Number of virtual interrupt numbers"
+       range 32 512
+       default "512"
+       help
+         This defines the number of virtual interrupt numbers the kernel
+         can manage. Virtual interrupt numbers are what you see in
+         /proc/interrupts. If you configure your system to have too few,
+         drivers will fail to load or worse - handle with care.
+
 config STACKTRACE_SUPPORT
        bool
        default y
@@ -199,19 +209,8 @@ config DEFAULT_UIMAGE
 config REDBOOT
        bool
 
-config HIBERNATE_32
-       bool
-       depends on (PPC_PMAC && !SMP) || BROKEN
-       default y
-
-config HIBERNATE_64
-       bool
-       depends on BROKEN || (PPC_PMAC64 && EXPERIMENTAL)
-       default y
-
 config ARCH_HIBERNATION_POSSIBLE
        bool
-       depends on (PPC64 && HIBERNATE_64) || (PPC32 && HIBERNATE_32)
        default y
 
 config ARCH_SUSPEND_POSSIBLE
@@ -378,6 +377,19 @@ config IRQ_ALL_CPUS
          CPU.  Generally saying Y is safe, although some problems have been
          reported with SMP Power Macintoshes with this option enabled.
 
+config SPARSE_IRQ
+       bool "Support sparse irq numbering"
+       default y
+       help
+         This enables support for sparse irqs. This is useful for distro
+         kernels that want to define a high CONFIG_NR_CPUS value but still
+         want to have low kernel memory footprint on smaller machines.
+
+         ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
+           out the irq_desc[] array in a more NUMA-friendly way. )
+
+         If you don't know what to do here, say Y.
+
 config NUMA
        bool "NUMA support"
        depends on PPC64
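
Note: the SPARSE_IRQ conversion in the hunks below replaces direct irq_desc[] indexing with descriptor lookups that can fail, since descriptors are now allocated on demand. A minimal sketch of the lookup pattern this merge applies throughout (irq_to_desc() is the generic-IRQ accessor):

        struct irq_desc *desc = irq_to_desc(irq);  /* may be NULL under SPARSE_IRQ */
        if (!desc)
                return;
        /* ... operate on desc->status, desc->chip, desc->action ... */
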
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index a98653b262312c6b3ca73b92089e7391e61527c2..57c4000719959cd4409d109f13d5cb723e70e95c 100644
        .globl label##_pSeries;                         \
 label##_pSeries:                                       \
        HMT_MEDIUM;                                     \
+       DO_KVM  n;                                      \
        mtspr   SPRN_SPRG_SCRATCH0,r13;         /* save r13 */  \
        EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common)
 
@@ -170,6 +171,7 @@ label##_pSeries:                                    \
        .globl label##_pSeries;                                         \
 label##_pSeries:                                                       \
        HMT_MEDIUM;                                                     \
+       DO_KVM  n;                                                      \
        mtspr   SPRN_SPRG_SCRATCH0,r13; /* save r13 */                  \
        mfspr   r13,SPRN_SPRG_PACA;     /* get paca address into r13 */ \
        std     r9,PACA_EXGEN+EX_R9(r13);       /* save r9, r10 */      \
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index b1dafb6a9743938fdf4bc63bc0b771400122bc83..5856a66ab4047e02c1ce8ed09a69b7c6a2b209be 100644
@@ -3,6 +3,10 @@
 
 #include <asm/page.h>
 
+pte_t *huge_pte_offset_and_shift(struct mm_struct *mm,
+                                unsigned long addr, unsigned *shift);
+
+void flush_dcache_icache_hugepage(struct page *page);
 
 int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
                           unsigned long len);
@@ -11,12 +15,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
                            unsigned long end, unsigned long floor,
                            unsigned long ceiling);
 
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-                    pte_t *ptep, pte_t pte);
-
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
-                             pte_t *ptep);
-
 /*
  * The version of vma_mmu_pagesize() in arch/powerpc/mm/hugetlbpage.c needs
  * to override the version in mm/hugetlb.c
@@ -42,9 +40,26 @@ static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
 {
 }
 
+
+static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+                                  pte_t *ptep, pte_t pte)
+{
+       set_pte_at(mm, addr, ptep, pte);
+}
+
+static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+                                           unsigned long addr, pte_t *ptep)
+{
+       unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
+       return __pte(old);
+}
+
 static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
                                         unsigned long addr, pte_t *ptep)
 {
+       pte_t pte;
+       pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
+       flush_tlb_page(vma, addr);
 }
 
 static inline int huge_pte_none(pte_t pte)
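
Note: huge_ptep_get_and_clear() above reuses the generic pte_update() with a clear mask of ~0UL, atomically clearing every PTE bit and returning the old value; the final argument 1 tells pte_update() it is dealing with a huge page. Sketch of the caller pattern, as huge_ptep_clear_flush() now does:

        pte_t old = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
        flush_tlb_page(vma, addr);      /* the old translation may still be in the TLB */
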
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index abbc2aaaced5bbc68d5be5699e9123a35cd99fdd..9f4c9d4f5803d10638b159abf8202e4283e5bca6 100644
@@ -64,11 +64,6 @@ extern void iseries_handle_interrupts(void);
                get_paca()->hard_enabled = 0;   \
        } while(0)
 
-static inline int irqs_disabled_flags(unsigned long flags)
-{
-       return flags == 0;
-}
-
 #else
 
 #if defined(CONFIG_BOOKE)
diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index bbcd1aaf3dfdfe79b00cc31e00850378139b1439..c85a32f1a17fc0d88f092c469a6557d8e49f147f 100644
@@ -17,8 +17,6 @@
 #include <asm/atomic.h>
 
 
-#define get_irq_desc(irq) (&irq_desc[(irq)])
-
 /* Define a way to iterate across irqs. */
 #define for_each_irq(i) \
        for ((i) = 0; (i) < NR_IRQS; ++(i))
@@ -34,12 +32,15 @@ extern atomic_t ppc_n_lost_interrupts;
  */
 #define NO_IRQ_IGNORE          ((unsigned int)-1)
 
-/* Total number of virq in the platform (make it a CONFIG_* option ? */
-#define NR_IRQS                512
+/* Total number of virq in the platform */
+#define NR_IRQS                CONFIG_NR_IRQS
 
 /* Number of irqs reserved for the legacy controller */
 #define NUM_ISA_INTERRUPTS     16
 
+/* Same thing, used by the generic IRQ code */
+#define NR_IRQS_LEGACY         NUM_ISA_INTERRUPTS
+
 /* This type is the placeholder for a hardware interrupt number. It has to
  * be big enough to enclose whatever representation is used by a given
  * platform.
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index bb2de6aa5ce0e59b1d7a851b385e563c1358619e..c9ca97f43bc1a55dbbfd3cf5d7f64bd64f884049 100644
@@ -46,6 +46,8 @@ struct kvm_regs {
 };
 
 struct kvm_sregs {
+       __u32 pvr;
+       char pad[1020];
 };
 
 struct kvm_fpu {
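
Note: pvr (4 bytes) plus the 1020-byte pad gives struct kvm_sregs a fixed size of 1024 bytes, presumably so later fields can be carved out of the pad without changing the user-space ABI. An illustrative compile-time check (the assertion is not part of this commit):

        BUILD_BUG_ON(sizeof(struct kvm_sregs) != 1024);  /* 4 + 1020 */
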
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 56bfae59837f761a03d4bb81a763281fbb224912..19ddb352fd0fccaede75e4dde9aafd34ecfc4a93 100644
 #define BOOKE_INTERRUPT_SPE_FP_ROUND 34
 #define BOOKE_INTERRUPT_PERFORMANCE_MONITOR 35
 
+/* book3s */
+
+#define BOOK3S_INTERRUPT_SYSTEM_RESET  0x100
+#define BOOK3S_INTERRUPT_MACHINE_CHECK 0x200
+#define BOOK3S_INTERRUPT_DATA_STORAGE  0x300
+#define BOOK3S_INTERRUPT_DATA_SEGMENT  0x380
+#define BOOK3S_INTERRUPT_INST_STORAGE  0x400
+#define BOOK3S_INTERRUPT_INST_SEGMENT  0x480
+#define BOOK3S_INTERRUPT_EXTERNAL      0x500
+#define BOOK3S_INTERRUPT_ALIGNMENT     0x600
+#define BOOK3S_INTERRUPT_PROGRAM       0x700
+#define BOOK3S_INTERRUPT_FP_UNAVAIL    0x800
+#define BOOK3S_INTERRUPT_DECREMENTER   0x900
+#define BOOK3S_INTERRUPT_SYSCALL       0xc00
+#define BOOK3S_INTERRUPT_TRACE         0xd00
+#define BOOK3S_INTERRUPT_PERFMON       0xf00
+#define BOOK3S_INTERRUPT_ALTIVEC       0xf20
+#define BOOK3S_INTERRUPT_VSX           0xf40
+
+#define BOOK3S_IRQPRIO_SYSTEM_RESET            0
+#define BOOK3S_IRQPRIO_DATA_SEGMENT            1
+#define BOOK3S_IRQPRIO_INST_SEGMENT            2
+#define BOOK3S_IRQPRIO_DATA_STORAGE            3
+#define BOOK3S_IRQPRIO_INST_STORAGE            4
+#define BOOK3S_IRQPRIO_ALIGNMENT               5
+#define BOOK3S_IRQPRIO_PROGRAM                 6
+#define BOOK3S_IRQPRIO_FP_UNAVAIL              7
+#define BOOK3S_IRQPRIO_ALTIVEC                 8
+#define BOOK3S_IRQPRIO_VSX                     9
+#define BOOK3S_IRQPRIO_SYSCALL                 10
+#define BOOK3S_IRQPRIO_MACHINE_CHECK           11
+#define BOOK3S_IRQPRIO_DEBUG                   12
+#define BOOK3S_IRQPRIO_EXTERNAL                        13
+#define BOOK3S_IRQPRIO_DECREMENTER             14
+#define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR     15
+#define BOOK3S_IRQPRIO_MAX                     16
+
+#define BOOK3S_HFLAG_DCBZ32                    0x1
+
 #define RESUME_FLAG_NV          (1<<0)  /* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST        (1<<1)  /* Resume host? */
 
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
new file mode 100644
index 0000000..c601133
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -0,0 +1,136 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#ifndef __ASM_KVM_BOOK3S_H__
+#define __ASM_KVM_BOOK3S_H__
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_ppc.h>
+
+struct kvmppc_slb {
+       u64 esid;
+       u64 vsid;
+       u64 orige;
+       u64 origv;
+       bool valid;
+       bool Ks;
+       bool Kp;
+       bool nx;
+       bool large;
+       bool class;
+};
+
+struct kvmppc_sr {
+       u32 raw;
+       u32 vsid;
+       bool Ks;
+       bool Kp;
+       bool nx;
+};
+
+struct kvmppc_bat {
+       u32 bepi;
+       u32 bepi_mask;
+       bool vs;
+       bool vp;
+       u32 brpn;
+       u8 wimg;
+       u8 pp;
+};
+
+struct kvmppc_sid_map {
+       u64 guest_vsid;
+       u64 guest_esid;
+       u64 host_vsid;
+       bool valid;
+};
+
+#define SID_MAP_BITS    9
+#define SID_MAP_NUM     (1 << SID_MAP_BITS)
+#define SID_MAP_MASK    (SID_MAP_NUM - 1)
+
+struct kvmppc_vcpu_book3s {
+       struct kvm_vcpu vcpu;
+       struct kvmppc_sid_map sid_map[SID_MAP_NUM];
+       struct kvmppc_slb slb[64];
+       struct {
+               u64 esid;
+               u64 vsid;
+       } slb_shadow[64];
+       u8 slb_shadow_max;
+       struct kvmppc_sr sr[16];
+       struct kvmppc_bat ibat[8];
+       struct kvmppc_bat dbat[8];
+       u64 hid[6];
+       int slb_nr;
+       u64 sdr1;
+       u64 dsisr;
+       u64 hior;
+       u64 msr_mask;
+       u64 vsid_first;
+       u64 vsid_next;
+       u64 vsid_max;
+       int context_id;
+};
+
+#define CONTEXT_HOST           0
+#define CONTEXT_GUEST          1
+#define CONTEXT_GUEST_END      2
+
+#define VSID_REAL      0xfffffffffff00000
+#define VSID_REAL_DR   0xffffffffffe00000
+#define VSID_REAL_IR   0xffffffffffd00000
+#define VSID_BAT       0xffffffffffc00000
+#define VSID_PR                0x8000000000000000
+
+extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, u64 ea, u64 ea_mask);
+extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask);
+extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, u64 pa_start, u64 pa_end);
+extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr);
+extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
+extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
+extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
+extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
+extern struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data);
+extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr, bool data);
+extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr);
+extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+
+extern u32 kvmppc_trampoline_lowmem;
+extern u32 kvmppc_trampoline_enter;
+
+static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
+{
+       return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu);
+}
+
+static inline ulong dsisr(void)
+{
+       ulong r;
+       asm ( "mfdsisr %0 " : "=r" (r) );
+       return r;
+}
+
+extern void kvm_return_point(void);
+
+#define INS_DCBZ                       0x7c0007ec
+
+#endif /* __ASM_KVM_BOOK3S_H__ */
diff --git a/arch/powerpc/include/asm/kvm_book3s_64_asm.h b/arch/powerpc/include/asm/kvm_book3s_64_asm.h
new file mode 100644
index 0000000..2e06ee8
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_book3s_64_asm.h
@@ -0,0 +1,58 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#ifndef __ASM_KVM_BOOK3S_ASM_H__
+#define __ASM_KVM_BOOK3S_ASM_H__
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+
+#include <asm/kvm_asm.h>
+
+.macro DO_KVM intno
+       .if (\intno == BOOK3S_INTERRUPT_SYSTEM_RESET) || \
+           (\intno == BOOK3S_INTERRUPT_MACHINE_CHECK) || \
+           (\intno == BOOK3S_INTERRUPT_DATA_STORAGE) || \
+           (\intno == BOOK3S_INTERRUPT_INST_STORAGE) || \
+           (\intno == BOOK3S_INTERRUPT_DATA_SEGMENT) || \
+           (\intno == BOOK3S_INTERRUPT_INST_SEGMENT) || \
+           (\intno == BOOK3S_INTERRUPT_EXTERNAL) || \
+           (\intno == BOOK3S_INTERRUPT_ALIGNMENT) || \
+           (\intno == BOOK3S_INTERRUPT_PROGRAM) || \
+           (\intno == BOOK3S_INTERRUPT_FP_UNAVAIL) || \
+           (\intno == BOOK3S_INTERRUPT_DECREMENTER) || \
+           (\intno == BOOK3S_INTERRUPT_SYSCALL) || \
+           (\intno == BOOK3S_INTERRUPT_TRACE) || \
+           (\intno == BOOK3S_INTERRUPT_PERFMON) || \
+           (\intno == BOOK3S_INTERRUPT_ALTIVEC) || \
+           (\intno == BOOK3S_INTERRUPT_VSX)
+
+       b       kvmppc_trampoline_\intno
+kvmppc_resume_\intno:
+
+       .endif
+.endm
+
+#else
+
+.macro DO_KVM intno
+.endm
+
+#endif /* CONFIG_KVM_BOOK3S_64_HANDLER */
+
+#endif /* __ASM_KVM_BOOK3S_ASM_H__ */
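
Note: expanding the macro by hand, with CONFIG_KVM_BOOK3S_64_HANDLER=y a "DO_KVM 0x300" placed at the data storage vector assembles to roughly the following (a sketch of the expansion, not assembler output):

        b       kvmppc_trampoline_0x300         /* enter the KVM trampoline */
kvmppc_resume_0x300:                            /* trampoline branches back here when
                                                   the exception is not for a guest */

For interrupt numbers outside the .if list, or when the handler support is not configured, the macro emits nothing and the vector is unchanged.
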
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index c9c930ed11d71fbe55ed8a742c1857a1efdaa7e4..1201f62d0d73f23e2e75e9b9622499ca259359c7 100644
@@ -21,7 +21,8 @@
 #define __POWERPC_KVM_HOST_H__
 
 #include <linux/mutex.h>
-#include <linux/timer.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/kvm_types.h>
 #include <asm/kvm_asm.h>
@@ -37,6 +38,8 @@
 #define KVM_NR_PAGE_SIZES      1
 #define KVM_PAGES_PER_HPAGE(x) (1UL<<31)
 
+#define HPTEG_CACHE_NUM 1024
+
 struct kvm;
 struct kvm_run;
 struct kvm_vcpu;
@@ -63,6 +66,17 @@ struct kvm_vcpu_stat {
        u32 dec_exits;
        u32 ext_intr_exits;
        u32 halt_wakeup;
+#ifdef CONFIG_PPC64
+       u32 pf_storage;
+       u32 pf_instruc;
+       u32 sp_storage;
+       u32 sp_instruc;
+       u32 queue_intr;
+       u32 ld;
+       u32 ld_slow;
+       u32 st;
+       u32 st_slow;
+#endif
 };
 
 enum kvm_exit_types {
@@ -109,9 +123,53 @@ struct kvmppc_exit_timing {
 struct kvm_arch {
 };
 
+struct kvmppc_pte {
+       u64 eaddr;
+       u64 vpage;
+       u64 raddr;
+       bool may_read;
+       bool may_write;
+       bool may_execute;
+};
+
+struct kvmppc_mmu {
+       /* book3s_64 only */
+       void (*slbmte)(struct kvm_vcpu *vcpu, u64 rb, u64 rs);
+       u64  (*slbmfee)(struct kvm_vcpu *vcpu, u64 slb_nr);
+       u64  (*slbmfev)(struct kvm_vcpu *vcpu, u64 slb_nr);
+       void (*slbie)(struct kvm_vcpu *vcpu, u64 slb_nr);
+       void (*slbia)(struct kvm_vcpu *vcpu);
+       /* book3s */
+       void (*mtsrin)(struct kvm_vcpu *vcpu, u32 srnum, ulong value);
+       u32  (*mfsrin)(struct kvm_vcpu *vcpu, u32 srnum);
+       int  (*xlate)(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *pte, bool data);
+       void (*reset_msr)(struct kvm_vcpu *vcpu);
+       void (*tlbie)(struct kvm_vcpu *vcpu, ulong addr, bool large);
+       int  (*esid_to_vsid)(struct kvm_vcpu *vcpu, u64 esid, u64 *vsid);
+       u64  (*ea_to_vp)(struct kvm_vcpu *vcpu, gva_t eaddr, bool data);
+       bool (*is_dcbz32)(struct kvm_vcpu *vcpu);
+};
+
+struct hpte_cache {
+       u64 host_va;
+       u64 pfn;
+       ulong slot;
+       struct kvmppc_pte pte;
+};
+
 struct kvm_vcpu_arch {
-       u32 host_stack;
+       ulong host_stack;
        u32 host_pid;
+#ifdef CONFIG_PPC64
+       ulong host_msr;
+       ulong host_r2;
+       void *host_retip;
+       ulong trampoline_lowmem;
+       ulong trampoline_enter;
+       ulong highmem_handler;
+       ulong host_paca_phys;
+       struct kvmppc_mmu mmu;
+#endif
 
        u64 fpr[32];
        ulong gpr[32];
@@ -123,6 +181,10 @@ struct kvm_vcpu_arch {
        ulong xer;
 
        ulong msr;
+#ifdef CONFIG_PPC64
+       ulong shadow_msr;
+       ulong hflags;
+#endif
        u32 mmucr;
        ulong sprg0;
        ulong sprg1;
@@ -149,6 +211,7 @@ struct kvm_vcpu_arch {
        u32 ivor[64];
        ulong ivpr;
        u32 pir;
+       u32 pvr;
 
        u32 shadow_pid;
        u32 pid;
@@ -174,6 +237,9 @@ struct kvm_vcpu_arch {
 #endif
 
        u32 last_inst;
+#ifdef CONFIG_PPC64
+       ulong fault_dsisr;
+#endif
        ulong fault_dear;
        ulong fault_esr;
        gpa_t paddr_accessed;
@@ -185,8 +251,15 @@ struct kvm_vcpu_arch {
 
        u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
 
-       struct timer_list dec_timer;
+       struct hrtimer dec_timer;
+       struct tasklet_struct tasklet;
+       u64 dec_jiffies;
        unsigned long pending_exceptions;
+
+#ifdef CONFIG_PPC64
+       struct hpte_cache hpte_cache[HPTEG_CACHE_NUM];
+       int hpte_cache_offset;
+#endif
 };
 
 #endif /* __POWERPC_KVM_HOST_H__ */
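
Note: dec_timer switching from timer_list to hrtimer means the decrementer callback now runs in hard-irq context, with the tasklet deferring the actual guest-interrupt work. A hedged sketch of the expected vcpu setup (the init lives elsewhere in this series; the kvmppc_decrementer_wakeup/kvmppc_decrementer_func names are assumptions, not confirmed by this hunk):

        hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
        tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
        vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
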
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2c6ee349df5e8cc9d5b286803a510a58b632ac9b..269ee46ab0285701f84411748a51e6f14c5b1018 100644
@@ -39,6 +39,7 @@ enum emulation_result {
 extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 extern char kvmppc_handlers_start[];
 extern unsigned long kvmppc_handler_len;
+extern void kvmppc_handler_highmem(void);
 
 extern void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu);
 extern int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index bebe31c2e9072bbd79144fbb616672b092e4d84e..7514ec2f854010c8c9900425b55a71edb8961db6 100644
@@ -173,14 +173,6 @@ extern unsigned long tce_alloc_start, tce_alloc_end;
  */
 extern int mmu_ci_restrictions;
 
-#ifdef CONFIG_HUGETLB_PAGE
-/*
- * The page size indexes of the huge pages for use by hugetlbfs
- */
-extern unsigned int mmu_huge_psizes[MMU_PAGE_COUNT];
-
-#endif /* CONFIG_HUGETLB_PAGE */
-
 /*
  * This function sets the AVPN and L fields of the HPTE  appropriately
  * for the page size
@@ -253,10 +245,11 @@ extern int __hash_page_64K(unsigned long ea, unsigned long access,
                           unsigned long vsid, pte_t *ptep, unsigned long trap,
                           unsigned int local, int ssize);
 struct mm_struct;
+unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap);
 extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap);
-extern int hash_huge_page(struct mm_struct *mm, unsigned long access,
-                         unsigned long ea, unsigned long vsid, int local,
-                         unsigned long trap);
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+                    pte_t *ptep, unsigned long trap, int local, int ssize,
+                    unsigned int shift, unsigned int mmu_psize);
 
 extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
                             unsigned long pstart, unsigned long prot,
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index b34e94d9443583d1bd17729a1230e38adde53479..26383e0778aaaa143ad3aa793753204d5f8d7ca9 100644
@@ -23,6 +23,8 @@ extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
 extern void set_context(unsigned long id, pgd_t *pgd);
 
 #ifdef CONFIG_PPC_BOOK3S_64
+extern int __init_new_context(void);
+extern void __destroy_context(int context_id);
 static inline void mmu_context_init(void) { }
 #else
 extern void mmu_context_init(void);
diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h
index 6c587eddee59a3fd371dd83af6714d308c8d1aeb..850b72f27445ce7b0ec27ca12810182614e14fb3 100644
@@ -73,7 +73,6 @@ extern int nvram_write_error_log(char * buff, int length,
 extern int nvram_read_error_log(char * buff, int length,
                                         unsigned int * err_type, unsigned int *err_seq);
 extern int nvram_clear_error_log(void);
-extern struct nvram_partition *nvram_find_partition(int sig, const char *name);
 
 extern int pSeries_nvram_init(void);
 
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 7d8514ceceaeeb54314788ec2018fc375bace078..5e9b4ef71415c70b207769b8a9caa1ff486eea71 100644
@@ -129,6 +129,15 @@ struct paca_struct {
        u64 system_time;                /* accumulated system TB ticks */
        u64 startpurr;                  /* PURR/TB value snapshot */
        u64 startspurr;                 /* SPURR value snapshot */
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+       struct  {
+               u64     esid;
+               u64     vsid;
+       } kvm_slb[64];                  /* guest SLB */
+       u8 kvm_slb_max;                 /* highest used guest slb entry */
+       u8 kvm_in_guest;                /* are we inside the guest? */
+#endif
 };
 
 extern struct paca_struct paca[];
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index ff24254990e11c4b64c35ee44f3ae664c36e8063..e96d52a516ba1bac8c0180a3eb75758bf55b919b 100644
@@ -229,6 +229,20 @@ typedef unsigned long pgprot_t;
 
 #endif
 
+typedef struct { signed long pd; } hugepd_t;
+#define HUGEPD_SHIFT_MASK     0x3f
+
+#ifdef CONFIG_HUGETLB_PAGE
+static inline int hugepd_ok(hugepd_t hpd)
+{
+       return (hpd.pd > 0);
+}
+
+#define is_hugepd(pdep)               (hugepd_ok(*((hugepd_t *)(pdep))))
+#else /* CONFIG_HUGETLB_PAGE */
+#define is_hugepd(pdep)                        0
+#endif /* CONFIG_HUGETLB_PAGE */
+
 struct page;
 extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
 extern void copy_user_page(void *to, void *from, unsigned long vaddr,
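
Note: the hugepd_t encoding works because 64-bit kernel pointers have the top address bit set, so a positive signed value in a page-table slot cannot be an ordinary lower-level table pointer; hugepd_ok() exploits that, and HUGEPD_SHIFT_MASK reserves the low 6 bits for the page shift. An illustrative accessor under that assumption (the real helpers live in the hugetlb code, not in this header):

        static inline unsigned long hugepd_shift(hugepd_t hpd)
        {
                return hpd.pd & HUGEPD_SHIFT_MASK;      /* low 6 bits hold the shift */
        }
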
diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index 3f17b83f55a18d8e4c8525136e257664414552f1..bfc4e027e2adf75640e99cd4c35555fc2834f3d8 100644
@@ -90,7 +90,7 @@ extern unsigned int HPAGE_SHIFT;
 #define HPAGE_SIZE             ((1UL) << HPAGE_SHIFT)
 #define HPAGE_MASK             (~(HPAGE_SIZE - 1))
 #define HUGETLB_PAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)
-#define HUGE_MAX_HSTATE                3
+#define HUGE_MAX_HSTATE                (MMU_PAGE_COUNT-1)
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/powerpc/include/asm/pgalloc-32.h b/arch/powerpc/include/asm/pgalloc-32.h
index c9500d666a1de37687346d7b7cfae5dd2f81ac7a..580cf73b96e8bd0971aad0245cfc26bdf137ddf3 100644
@@ -3,7 +3,8 @@
 
 #include <linux/threads.h>
 
-#define PTE_NONCACHE_NUM       0  /* dummy for now to share code w/ppc64 */
+/* For 32-bit, all levels of page tables are just drawn from get_free_page() */
+#define MAX_PGTABLE_INDEX_SIZE 0
 
 extern void __bad_pte(pmd_t *pmd);
 
@@ -36,11 +37,10 @@ extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr);
 extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
 
-static inline void pgtable_free(pgtable_free_t pgf)
+static inline void pgtable_free(void *table, unsigned index_size)
 {
-       void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK);
-
-       free_page((unsigned long)p);
+       BUG_ON(index_size); /* 32-bit doesn't use this */
+       free_page((unsigned long)table);
 }
 
 #define check_pgt_cache()      do { } while (0)
diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
index e6f069c4f713a7b2c7ce76e3da593693d2fc1d79..5c1cd73dafa839a3ca34d38bd69acb0a748d591b 100644
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
 
+/*
+ * Functions that deal with pagetables that could be at any level of
+ * the table need to be passed an "index_size" so they know how to
+ * handle allocation.  For PTE pages (which are linked to a struct
+ * page for now, and drawn from the main get_free_pages() pool), the
+ * allocation size will be (2^index_size * sizeof(pointer)) and
+ * allocations are drawn from the kmem_cache in PGT_CACHE(index_size).
+ *
+ * The maximum index size needs to be big enough to allow any
+ * pagetable sizes we need, but small enough to fit in the low bits of
+ * any page table pointer.  In other words all pagetables, even tiny
+ * ones, must be aligned to allow at least enough low 0 bits to
+ * contain this value.  This value is also used as a mask, so it must
+ * be one less than a power of two.
+ */
+#define MAX_PGTABLE_INDEX_SIZE 0xf
+
 #ifndef CONFIG_PPC_SUBPAGE_PROT
 static inline void subpage_prot_free(pgd_t *pgd) {}
 #endif
 
 extern struct kmem_cache *pgtable_cache[];
-
-#define PGD_CACHE_NUM          0
-#define PUD_CACHE_NUM          1
-#define PMD_CACHE_NUM          1
-#define HUGEPTE_CACHE_NUM      2
-#define PTE_NONCACHE_NUM       7  /* from GFP rather than kmem_cache */
+#define PGT_CACHE(shift) (pgtable_cache[(shift)-1])
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-       return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL);
+       return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL);
 }
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
        subpage_prot_free(pgd);
-       kmem_cache_free(pgtable_cache[PGD_CACHE_NUM], pgd);
+       kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd);
 }
 
 #ifndef CONFIG_PPC_64K_PAGES
@@ -40,13 +52,13 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-       return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM],
+       return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE),
                                GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 {
-       kmem_cache_free(pgtable_cache[PUD_CACHE_NUM], pud);
+       kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
 }
 
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
@@ -78,13 +90,13 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-       return kmem_cache_alloc(pgtable_cache[PMD_CACHE_NUM],
+       return kmem_cache_alloc(PGT_CACHE(PMD_INDEX_SIZE),
                                GFP_KERNEL|__GFP_REPEAT);
 }
 
 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
-       kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd);
+       kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd);
 }
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
@@ -107,24 +119,22 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm,
        return page;
 }
 
-static inline void pgtable_free(pgtable_free_t pgf)
+static inline void pgtable_free(void *table, unsigned index_size)
 {
-       void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK);
-       int cachenum = pgf.val & PGF_CACHENUM_MASK;
-
-       if (cachenum == PTE_NONCACHE_NUM)
-               free_page((unsigned long)p);
-       else
-               kmem_cache_free(pgtable_cache[cachenum], p);
+       if (!index_size)
+               free_page((unsigned long)table);
+       else {
+               BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE);
+               kmem_cache_free(PGT_CACHE(index_size), table);
+       }
 }
 
-#define __pmd_free_tlb(tlb, pmd,addr)                \
-       pgtable_free_tlb(tlb, pgtable_free_cache(pmd, \
-               PMD_CACHE_NUM, PMD_TABLE_SIZE-1))
+#define __pmd_free_tlb(tlb, pmd, addr)               \
+       pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE)
 #ifndef CONFIG_PPC_64K_PAGES
 #define __pud_free_tlb(tlb, pud, addr)               \
-       pgtable_free_tlb(tlb, pgtable_free_cache(pud, \
-               PUD_CACHE_NUM, PUD_TABLE_SIZE-1))
+       pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE)
+
 #endif /* CONFIG_PPC_64K_PAGES */
 
 #define check_pgt_cache()      do { } while (0)
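
Note: the index_size scheme keeps allocation and freeing symmetric: a table is freed back through the same PGT_CACHE(index) it was allocated from, and index 0 means a raw page from get_free_page(). A usage sketch with the names defined above:

        pmd_t *pmd = kmem_cache_alloc(PGT_CACHE(PMD_INDEX_SIZE), GFP_KERNEL);
        /* ... use the table ... */
        pgtable_free(pmd, PMD_INDEX_SIZE);      /* routes back to the same kmem_cache */
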
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index f2e812de7c3cb919f26074f79414a77271626429..abe8532bd14e4dfc27693cfadce3d49a78380881 100644
@@ -24,25 +24,6 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
        __free_page(ptepage);
 }
 
-typedef struct pgtable_free {
-       unsigned long val;
-} pgtable_free_t;
-
-/* This needs to be big enough to allow for MMU_PAGE_COUNT + 2 to be stored
- * and small enough to fit in the low bits of any naturally aligned page
- * table cache entry. Arbitrarily set to 0x1f, that should give us some
- * room to grow
- */
-#define PGF_CACHENUM_MASK      0x1f
-
-static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum,
-                                               unsigned long mask)
-{
-       BUG_ON(cachenum > PGF_CACHENUM_MASK);
-
-       return (pgtable_free_t){.val = ((unsigned long) p & ~mask) | cachenum};
-}
-
 #ifdef CONFIG_PPC64
 #include <asm/pgalloc-64.h>
 #else
@@ -50,12 +31,12 @@ static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum,
 #endif
 
 #ifdef CONFIG_SMP
-extern void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf);
+extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift);
 extern void pte_free_finish(void);
 #else /* CONFIG_SMP */
-static inline void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
+static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift)
 {
-       pgtable_free(pgf);
+       pgtable_free(table, shift);
 }
 static inline void pte_free_finish(void) { }
 #endif /* !CONFIG_SMP */
@@ -63,12 +44,9 @@ static inline void pte_free_finish(void) { }
 static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage,
                                  unsigned long address)
 {
-       pgtable_free_t pgf = pgtable_free_cache(page_address(ptepage),
-                                               PTE_NONCACHE_NUM,
-                                               PTE_TABLE_SIZE-1);
        tlb_flush_pgtable(tlb, address);
        pgtable_page_dtor(ptepage);
-       pgtable_free_tlb(tlb, pgf);
+       pgtable_free_tlb(tlb, page_address(ptepage), 0);
 }
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 806abe7a3fa58c32523a167f90ad18a0ef5f8aa0..49865045d56f2559377a62c3938cb9656f76f360 100644
@@ -354,6 +354,7 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 #define pgoff_to_pte(off)      ((pte_t) {((off) << PTE_RPN_SHIFT)|_PAGE_FILE})
 #define PTE_FILE_MAX_BITS      (BITS_PER_LONG - PTE_RPN_SHIFT)
 
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
 void pgtable_cache_init(void);
 
 /*
@@ -378,7 +379,18 @@ void pgtable_cache_init(void);
        return pt;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long address);
+#ifdef CONFIG_HUGETLB_PAGE
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+                                unsigned *shift);
+#else
+static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+                                              unsigned *shift)
+{
+       if (shift)
+               *shift = 0;
+       return find_linux_pte(pgdir, ea);
+}
+#endif /* !CONFIG_HUGETLB_PAGE */
 
 #endif /* __ASSEMBLY__ */
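
Note: callers of find_linux_pte_or_hugepte() treat a returned shift of 0 as "ordinary page"; the perf_callchain.c hunk below uses exactly this pattern:

        unsigned shift;
        pte_t *ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
        if (!shift)
                shift = PAGE_SHIFT;             /* 0 means a normal PTE page */
        offset = addr & ((1UL << shift) - 1);   /* offset within the page */
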
 
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 2a5da069714e2e98e73ffbc4a8f7e20f84aa7a33..21207e54825b0566d5edf4c447c6d252ea76b497 100644
@@ -211,6 +211,9 @@ extern void paging_init(void);
  */
 extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t);
 
+extern int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, unsigned long addr,
+                     unsigned long end, int write, struct page **pages, int *nr);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index b23664a0b86c89c7c87bde645acf3a338dc9e279..c002b0410219a341c8fdc880baa8c0acc45bd683 100644
@@ -42,10 +42,11 @@ obj-$(CONFIG_ALTIVEC)               += vecemu.o
 obj-$(CONFIG_PPC_970_NAP)      += idle_power4.o
 obj-$(CONFIG_PPC_OF)           += of_device.o of_platform.o prom_parse.o
 obj-$(CONFIG_PPC_CLOCK)                += clock.o
-procfs-$(CONFIG_PPC64)         := proc_ppc64.o
+procfs-y                       := proc_powerpc.o
 obj-$(CONFIG_PROC_FS)          += $(procfs-y)
 rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI)  := rtas_pci.o
 obj-$(CONFIG_PPC_RTAS)         += rtas.o rtas-rtc.o $(rtaspci-y-y)
+obj-$(CONFIG_PPC_RTAS_DAEMON)  += rtasd.o
 obj-$(CONFIG_RTAS_FLASH)       += rtas_flash.o
 obj-$(CONFIG_RTAS_PROC)                += rtas-proc.o
 obj-$(CONFIG_LPARCFG)          += lparcfg.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 0812b0f414bbe486f38db00ab1a171c4810410ef..e2e2082acf2983e96c9ed14a5dd1644125cb798f 100644
@@ -190,6 +190,11 @@ int main(void)
        DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time));
        DEFINE(PACA_DATA_OFFSET, offsetof(struct paca_struct, data_offset));
        DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save));
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+       DEFINE(PACA_KVM_IN_GUEST, offsetof(struct paca_struct, kvm_in_guest));
+       DEFINE(PACA_KVM_SLB, offsetof(struct paca_struct, kvm_slb));
+       DEFINE(PACA_KVM_SLB_MAX, offsetof(struct paca_struct, kvm_slb_max));
+#endif
 #endif /* CONFIG_PPC64 */
 
        /* RTAS */
@@ -398,6 +403,19 @@ int main(void)
        DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
        DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear));
        DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr));
+
+       /* book3s_64 */
+#ifdef CONFIG_PPC64
+       DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
+       DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip));
+       DEFINE(VCPU_HOST_R2, offsetof(struct kvm_vcpu, arch.host_r2));
+       DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr));
+       DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr));
+       DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem));
+       DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter));
+       DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler));
+       DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
+#endif
 #endif
 #ifdef CONFIG_44x
        DEFINE(PGD_T_LOG2, PGD_T_LOG2);
diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index 0a8439aafdd1ed26f78d61bc110703f32bd284c4..6f4613dd05ef05294e10f8cab83e243094f7f731 100644
@@ -373,7 +373,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs)
        hard_irq_disable();
 
        for_each_irq(i) {
-               struct irq_desc *desc = irq_desc + i;
+               struct irq_desc *desc = irq_to_desc(i);
 
                if (desc->status & IRQ_INPROGRESS)
                        desc->chip->eoi(i);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 1808876edcc91d4f43fdff8fde2c8e1770fc832f..fc3ead066cec8c8ba61e1312785e3aabfc98d5a4 100644
@@ -41,6 +41,7 @@ __start_interrupts:
        . = 0x200
 _machine_check_pSeries:
        HMT_MEDIUM
+       DO_KVM  0x200
        mtspr   SPRN_SPRG_SCRATCH0,r13          /* save r13 */
        EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common)
 
@@ -48,6 +49,7 @@ _machine_check_pSeries:
        .globl data_access_pSeries
 data_access_pSeries:
        HMT_MEDIUM
+       DO_KVM  0x300
        mtspr   SPRN_SPRG_SCRATCH0,r13
 BEGIN_FTR_SECTION
        mfspr   r13,SPRN_SPRG_PACA
@@ -77,6 +79,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_SLB)
        .globl data_access_slb_pSeries
 data_access_slb_pSeries:
        HMT_MEDIUM
+       DO_KVM  0x380
        mtspr   SPRN_SPRG_SCRATCH0,r13
        mfspr   r13,SPRN_SPRG_PACA              /* get paca address into r13 */
        std     r3,PACA_EXSLB+EX_R3(r13)
@@ -115,6 +118,7 @@ data_access_slb_pSeries:
        .globl instruction_access_slb_pSeries
 instruction_access_slb_pSeries:
        HMT_MEDIUM
+       DO_KVM  0x480
        mtspr   SPRN_SPRG_SCRATCH0,r13
        mfspr   r13,SPRN_SPRG_PACA              /* get paca address into r13 */
        std     r3,PACA_EXSLB+EX_R3(r13)
@@ -154,6 +158,7 @@ instruction_access_slb_pSeries:
        .globl  system_call_pSeries
 system_call_pSeries:
        HMT_MEDIUM
+       DO_KVM  0xc00
 BEGIN_FTR_SECTION
        cmpdi   r0,0x1ebe
        beq-    1f
@@ -186,12 +191,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)
         * trickery is thus necessary
         */
        . = 0xf00
+       DO_KVM  0xf00
        b       performance_monitor_pSeries
 
        . = 0xf20
+       DO_KVM  0xf20
        b       altivec_unavailable_pSeries
 
        . = 0xf40
+       DO_KVM  0xf40
        b       vsx_unavailable_pSeries
 
 #ifdef CONFIG_CBE_RAS
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index c38afdb45d7b066d2720d69a484f4c5a38ed71c0..9258074880221d48af37c37d1e8db69b3e378b47 100644
@@ -37,6 +37,7 @@
 #include <asm/firmware.h>
 #include <asm/page_64.h>
 #include <asm/irqflags.h>
+#include <asm/kvm_book3s_64_asm.h>
 
 /* The physical memory is layed out such that the secondary processor
  * spin code sits at 0x0000...0x00ff. On server, the vectors follow
@@ -165,6 +166,12 @@ exception_marker:
 #include "exceptions-64s.S"
 #endif
 
+/* KVM trampoline code needs to be close to the interrupt handlers */
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#include "../kvm/book3s_64_rmhandlers.S"
+#endif
+
 _GLOBAL(generic_secondary_thread_init)
        mr      r24,r3
 
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index e5d1211779840f029841e1591952c3b5ec9b4fae..eba53923630f22da6f71b173679a12eac6f478a6 100644
@@ -85,7 +85,10 @@ extern int tau_interrupts(int);
 #endif /* CONFIG_PPC32 */
 
 #ifdef CONFIG_PPC64
+
+#ifndef CONFIG_SPARSE_IRQ
 EXPORT_SYMBOL(irq_desc);
+#endif
 
 int distribute_irqs = 1;
 
@@ -187,33 +190,7 @@ int show_interrupts(struct seq_file *p, void *v)
                for_each_online_cpu(j)
                        seq_printf(p, "CPU%d       ", j);
                seq_putc(p, '\n');
-       }
-
-       if (i < NR_IRQS) {
-               desc = get_irq_desc(i);
-               spin_lock_irqsave(&desc->lock, flags);
-               action = desc->action;
-               if (!action || !action->handler)
-                       goto skip;
-               seq_printf(p, "%3d: ", i);
-#ifdef CONFIG_SMP
-               for_each_online_cpu(j)
-                       seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-#else
-               seq_printf(p, "%10u ", kstat_irqs(i));
-#endif /* CONFIG_SMP */
-               if (desc->chip)
-                       seq_printf(p, " %s ", desc->chip->typename);
-               else
-                       seq_puts(p, "  None      ");
-               seq_printf(p, "%s", (desc->status & IRQ_LEVEL) ? "Level " : "Edge  ");
-               seq_printf(p, "    %s", action->name);
-               for (action = action->next; action; action = action->next)
-                       seq_printf(p, ", %s", action->name);
-               seq_putc(p, '\n');
-skip:
-               spin_unlock_irqrestore(&desc->lock, flags);
-       } else if (i == NR_IRQS) {
+       } else if (i == nr_irqs) {
 #if defined(CONFIG_PPC32) && defined(CONFIG_TAU_INT)
                if (tau_initialized){
                        seq_puts(p, "TAU: ");
@@ -223,30 +200,68 @@ skip:
                }
 #endif /* CONFIG_PPC32 && CONFIG_TAU_INT*/
                seq_printf(p, "BAD: %10u\n", ppc_spurious_interrupts);
+
+               return 0;
        }
+
+       desc = irq_to_desc(i);
+       if (!desc)
+               return 0;
+
+       spin_lock_irqsave(&desc->lock, flags);
+
+       action = desc->action;
+       if (!action || !action->handler)
+               goto skip;
+
+       seq_printf(p, "%3d: ", i);
+#ifdef CONFIG_SMP
+       for_each_online_cpu(j)
+               seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
+#else
+       seq_printf(p, "%10u ", kstat_irqs(i));
+#endif /* CONFIG_SMP */
+
+       if (desc->chip)
+               seq_printf(p, " %s ", desc->chip->typename);
+       else
+               seq_puts(p, "  None      ");
+
+       seq_printf(p, "%s", (desc->status & IRQ_LEVEL) ? "Level " : "Edge  ");
+       seq_printf(p, "    %s", action->name);
+
+       for (action = action->next; action; action = action->next)
+               seq_printf(p, ", %s", action->name);
+       seq_putc(p, '\n');
+
+skip:
+       spin_unlock_irqrestore(&desc->lock, flags);
+
        return 0;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
 void fixup_irqs(cpumask_t map)
 {
+       struct irq_desc *desc;
        unsigned int irq;
        static int warned;
 
        for_each_irq(irq) {
                cpumask_t mask;
 
-               if (irq_desc[irq].status & IRQ_PER_CPU)
+               desc = irq_to_desc(irq);
+               if (desc && desc->status & IRQ_PER_CPU)
                        continue;
 
-               cpumask_and(&mask, irq_desc[irq].affinity, &map);
+               cpumask_and(&mask, desc->affinity, &map);
                if (any_online_cpu(mask) == NR_CPUS) {
                        printk("Breaking affinity for irq %i\n", irq);
                        mask = map;
                }
-               if (irq_desc[irq].chip->set_affinity)
-                       irq_desc[irq].chip->set_affinity(irq, &mask);
-               else if (irq_desc[irq].action && !(warned++))
+               if (desc->chip->set_affinity)
+                       desc->chip->set_affinity(irq, &mask);
+               else if (desc->action && !(warned++))
                        printk("Cannot set affinity for irq %i\n", irq);
        }
 
@@ -273,7 +288,7 @@ static inline void handle_one_irq(unsigned int irq)
                return;
        }
 
-       desc = irq_desc + irq;
+       desc = irq_to_desc(irq);
        saved_sp_limit = current->thread.ksp_limit;
 
        irqtp->task = curtp->task;
@@ -535,7 +550,7 @@ struct irq_host *irq_alloc_host(struct device_node *of_node,
                        smp_wmb();
 
                        /* Clear norequest flags */
-                       get_irq_desc(i)->status &= ~IRQ_NOREQUEST;
+                       irq_to_desc(i)->status &= ~IRQ_NOREQUEST;
 
                        /* Legacy flags are left to default at this point,
                         * one can then use irq_create_mapping() to
@@ -601,8 +616,16 @@ void irq_set_virq_count(unsigned int count)
 static int irq_setup_virq(struct irq_host *host, unsigned int virq,
                            irq_hw_number_t hwirq)
 {
+       struct irq_desc *desc;
+
+       desc = irq_to_desc_alloc_node(virq, 0);
+       if (!desc) {
+               pr_debug("irq: -> allocating desc failed\n");
+               goto error;
+       }
+
        /* Clear IRQ_NOREQUEST flag */
-       get_irq_desc(virq)->status &= ~IRQ_NOREQUEST;
+       desc->status &= ~IRQ_NOREQUEST;
 
        /* map it */
        smp_wmb();
@@ -611,11 +634,14 @@ static int irq_setup_virq(struct irq_host *host, unsigned int virq,
 
        if (host->ops->map(host, virq, hwirq)) {
                pr_debug("irq: -> mapping failed, freeing\n");
-               irq_free_virt(virq, 1);
-               return -1;
+               goto error;
        }
 
        return 0;
+
+error:
+       irq_free_virt(virq, 1);
+       return -1;
 }
 
 unsigned int irq_create_direct_mapping(struct irq_host *host)
@@ -732,7 +758,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller,
 
        /* Set type if specified and different than the current one */
        if (type != IRQ_TYPE_NONE &&
-           type != (get_irq_desc(virq)->status & IRQF_TRIGGER_MASK))
+           type != (irq_to_desc(virq)->status & IRQF_TRIGGER_MASK))
                set_irq_type(virq, type);
        return virq;
 }
@@ -804,7 +830,7 @@ void irq_dispose_mapping(unsigned int virq)
        irq_map[virq].hwirq = host->inval_irq;
 
        /* Set some flags */
-       get_irq_desc(virq)->status |= IRQ_NOREQUEST;
+       irq_to_desc(virq)->status |= IRQ_NOREQUEST;
 
        /* Free it */
        irq_free_virt(virq, 1);
@@ -996,12 +1022,24 @@ void irq_free_virt(unsigned int virq, unsigned int count)
        spin_unlock_irqrestore(&irq_big_lock, flags);
 }
 
-void irq_early_init(void)
+int arch_early_irq_init(void)
 {
-       unsigned int i;
+       struct irq_desc *desc;
+       int i;
+
+       for (i = 0; i < NR_IRQS; i++) {
+               desc = irq_to_desc(i);
+               if (desc)
+                       desc->status |= IRQ_NOREQUEST;
+       }
 
-       for (i = 0; i < NR_IRQS; i++)
-               get_irq_desc(i)->status |= IRQ_NOREQUEST;
+       return 0;
+}
+
+int arch_init_chip_data(struct irq_desc *desc, int node)
+{
+       desc->status |= IRQ_NOREQUEST;
+       return 0;
 }
 
 /* We need to create the radix trees late */
@@ -1063,8 +1101,11 @@ static int virq_debug_show(struct seq_file *m, void *private)
        seq_printf(m, "%-5s  %-7s  %-15s  %s\n", "virq", "hwirq",
                      "chip name", "host name");
 
-       for (i = 1; i < NR_IRQS; i++) {
-               desc = get_irq_desc(i);
+       for (i = 1; i < nr_irqs; i++) {
+               desc = irq_to_desc(i);
+               if (!desc)
+                       continue;
+
                spin_lock_irqsave(&desc->lock, flags);
 
                if (desc->action && desc->action->handler) {
diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c
index ed0ac4e4b8d8c82639625d67c8619d495bd9c9d2..79a00bb9c64ca86c77d1af2b8889683d710e434d 100644
@@ -781,9 +781,9 @@ static int __init lparcfg_init(void)
                        !firmware_has_feature(FW_FEATURE_ISERIES))
                mode |= S_IWUSR;
 
-       ent = proc_create("ppc64/lparcfg", mode, NULL, &lparcfg_fops);
+       ent = proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_fops);
        if (!ent) {
-               printk(KERN_ERR "Failed to create ppc64/lparcfg\n");
+               printk(KERN_ERR "Failed to create powerpc/lparcfg\n");
                return -EIO;
        }
 
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index 0ed31f2204826e2fa68315bd5bfe7be3cf3b10ba..a8f9251f4cac4d294a06d0e44f6fd71265e83e8f 100644
@@ -184,7 +184,7 @@ static struct miscdevice nvram_dev = {
 
 
 #ifdef DEBUG_NVRAM
-static void nvram_print_partitions(char * label)
+static void __init nvram_print_partitions(char * label)
 {
        struct list_head * p;
        struct nvram_partition * tmp_part;
@@ -202,7 +202,7 @@ static void nvram_print_partitions(char * label)
 #endif
 
 
-static int nvram_write_header(struct nvram_partition * part)
+static int __init nvram_write_header(struct nvram_partition * part)
 {
        loff_t tmp_index;
        int rc;
@@ -214,7 +214,7 @@ static int nvram_write_header(struct nvram_partition * part)
 }
 
 
-static unsigned char nvram_checksum(struct nvram_header *p)
+static unsigned char __init nvram_checksum(struct nvram_header *p)
 {
        unsigned int c_sum, c_sum2;
        unsigned short *sp = (unsigned short *)p->name; /* assume 6 shorts */
@@ -228,32 +228,7 @@ static unsigned char nvram_checksum(struct nvram_header *p)
        return c_sum;
 }
 
-
-/*
- * Find an nvram partition, sig can be 0 for any
- * partition or name can be NULL for any name, else
- * tries to match both
- */
-struct nvram_partition *nvram_find_partition(int sig, const char *name)
-{
-       struct nvram_partition * part;
-       struct list_head * p;
-
-       list_for_each(p, &nvram_part->partition) {
-               part = list_entry(p, struct nvram_partition, partition);
-
-               if (sig && part->header.signature != sig)
-                       continue;
-               if (name && 0 != strncmp(name, part->header.name, 12))
-                       continue;
-               return part; 
-       }
-       return NULL;
-}
-EXPORT_SYMBOL(nvram_find_partition);
-
-
-static int nvram_remove_os_partition(void)
+static int __init nvram_remove_os_partition(void)
 {
        struct list_head *i;
        struct list_head *j;
@@ -319,7 +294,7 @@ static int nvram_remove_os_partition(void)
  * Will create a partition starting at the first free
  * space found if space has enough room.
  */
-static int nvram_create_os_partition(void)
+static int __init nvram_create_os_partition(void)
 {
        struct nvram_partition *part;
        struct nvram_partition *new_part;
@@ -422,7 +397,7 @@ static int nvram_create_os_partition(void)
  * 5.) If the max chunk cannot be allocated then try finding a chunk
  * that will satisfy the minum needed (NVRAM_MIN_REQ).
  */
-static int nvram_setup_partition(void)
+static int __init nvram_setup_partition(void)
 {
        struct list_head * p;
        struct nvram_partition * part;
@@ -480,7 +455,7 @@ static int nvram_setup_partition(void)
 }
 
 
-static int nvram_scan_partitions(void)
+static int __init nvram_scan_partitions(void)
 {
        loff_t cur_index = 0;
        struct nvram_header phead;
@@ -706,6 +681,9 @@ int nvram_clear_error_log(void)
        int clear_word = ERR_FLAG_ALREADY_LOGGED;
        int rc;
 
+       if (nvram_error_log_index == -1)
+               return -1;
+
        tmp_index = nvram_error_log_index;
        
        rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index);
diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c
index 0a03cf70d24793a9d0295acd4dfaf8abf0b0dd59..936f04dbfc6f00bd1803a643753520b34e1ae8b1 100644
@@ -119,13 +119,6 @@ static void perf_callchain_kernel(struct pt_regs *regs,
 }
 
 #ifdef CONFIG_PPC64
-
-#ifdef CONFIG_HUGETLB_PAGE
-#define is_huge_psize(pagesize)        (HPAGE_SHIFT && mmu_huge_psizes[pagesize])
-#else
-#define is_huge_psize(pagesize)        0
-#endif
-
 /*
  * On 64-bit we don't want to invoke hash_page on user addresses from
  * interrupt context, so if the access faults, we read the page tables
@@ -135,7 +128,7 @@ static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
 {
        pgd_t *pgdir;
        pte_t *ptep, pte;
-       int pagesize;
+       unsigned shift;
        unsigned long addr = (unsigned long) ptr;
        unsigned long offset;
        unsigned long pfn;
@@ -145,17 +138,14 @@ static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
        if (!pgdir)
                return -EFAULT;
 
-       pagesize = get_slice_psize(current->mm, addr);
+       ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
+       if (!shift)
+               shift = PAGE_SHIFT;
 
        /* align address to page boundary */
-       offset = addr & ((1ul << mmu_psize_defs[pagesize].shift) - 1);
+       offset = addr & ((1UL << shift) - 1);
        addr -= offset;
 
-       if (is_huge_psize(pagesize))
-               ptep = huge_pte_offset(current->mm, addr);
-       else
-               ptep = find_linux_pte(pgdir, addr);
-
        if (ptep == NULL)
                return -EFAULT;
        pte = *ptep;
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index c8b27bb4dbdec45eba0de471c106728bb892fdee..07115d6cd4ba7dd778d2a71a071bb63f4c726beb 100644
@@ -162,7 +162,6 @@ EXPORT_SYMBOL(screen_info);
 
 #ifdef CONFIG_PPC32
 EXPORT_SYMBOL(timer_interrupt);
-EXPORT_SYMBOL(irq_desc);
 EXPORT_SYMBOL(tb_ticks_per_jiffy);
 EXPORT_SYMBOL(cacheable_memcpy);
 EXPORT_SYMBOL(cacheable_memzero);
diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c
new file mode 100644
index 0000000..1ed3b8d
--- /dev/null
+++ b/arch/powerpc/kernel/proc_powerpc.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+
+#include <asm/machdep.h>
+#include <asm/vdso_datapage.h>
+#include <asm/rtas.h>
+#include <asm/uaccess.h>
+#include <asm/prom.h>
+
+#ifdef CONFIG_PPC64
+
+static loff_t page_map_seek( struct file *file, loff_t off, int whence)
+{
+       loff_t new;
+       struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
+
+       switch (whence) {
+       case 0:
+               new = off;
+               break;
+       case 1:
+               new = file->f_pos + off;
+               break;
+       case 2:
+               new = dp->size + off;
+               break;
+       default:
+               return -EINVAL;
+       }
+       if (new < 0 || new > dp->size)
+               return -EINVAL;
+       return (file->f_pos = new);
+}
+
+static ssize_t page_map_read(struct file *file, char __user *buf, size_t nbytes,
+                             loff_t *ppos)
+{
+       struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
+       return simple_read_from_buffer(buf, nbytes, ppos, dp->data, dp->size);
+}
+
+static int page_map_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
+
+       if ((vma->vm_end - vma->vm_start) > dp->size)
+               return -EINVAL;
+
+       return remap_pfn_range(vma, vma->vm_start,
+                              __pa(dp->data) >> PAGE_SHIFT,
+                              dp->size, vma->vm_page_prot);
+}
+
+static const struct file_operations page_map_fops = {
+       .llseek = page_map_seek,
+       .read   = page_map_read,
+       .mmap   = page_map_mmap
+};
+
+static int __init proc_ppc64_init(void)
+{
+       struct proc_dir_entry *pde;
+
+       pde = proc_create_data("powerpc/systemcfg", S_IFREG|S_IRUGO, NULL,
+                              &page_map_fops, vdso_data);
+       if (!pde)
+               return 1;
+       pde->size = PAGE_SIZE;
+
+       return 0;
+}
+__initcall(proc_ppc64_init);
+
+#endif /* CONFIG_PPC64 */
+
+/*
+ * Create the ppc64 and ppc64/rtas directories early. This allows us to
+ * assume that they have been previously created in drivers.
+ */
+static int __init proc_ppc64_create(void)
+{
+       struct proc_dir_entry *root;
+
+       root = proc_mkdir("powerpc", NULL);
+       if (!root)
+               return 1;
+
+#ifdef CONFIG_PPC64
+       if (!proc_symlink("ppc64", NULL, "powerpc"))
+               pr_err("Failed to create link /proc/ppc64 -> /proc/powerpc\n");
+#endif
+
+       if (!of_find_node_by_path("/rtas"))
+               return 0;
+
+       if (!proc_mkdir("rtas", root))
+               return 1;
+
+       if (!proc_symlink("rtas", NULL, "powerpc/rtas"))
+               return 1;
+
+       return 0;
+}
+core_initcall(proc_ppc64_create);
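
The new file keeps old userspace working across the /proc/ppc64 -> /proc/powerpc move via the proc_symlink() calls above. A sketch of how a consumer could prefer the new path and fall back on older kernels (paths as in this commit; error handling kept minimal):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/proc/powerpc/systemcfg", O_RDONLY);

            if (fd < 0)     /* pre-move kernel: only the old path exists */
                    fd = open("/proc/ppc64/systemcfg", O_RDONLY);
            if (fd < 0) {
                    perror("systemcfg");
                    return 1;
            }
            /* ... read or mmap the vdso data page here ... */
            close(fd);
            return 0;
    }
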
diff --git a/arch/powerpc/kernel/proc_ppc64.c b/arch/powerpc/kernel/proc_ppc64.c
deleted file mode 100644 (file)
index c647dde..0000000
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/proc_fs.h>
-#include <linux/slab.h>
-#include <linux/kernel.h>
-
-#include <asm/machdep.h>
-#include <asm/vdso_datapage.h>
-#include <asm/rtas.h>
-#include <asm/uaccess.h>
-#include <asm/prom.h>
-
-static loff_t  page_map_seek( struct file *file, loff_t off, int whence);
-static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes,
-                             loff_t *ppos);
-static int     page_map_mmap( struct file *file, struct vm_area_struct *vma );
-
-static const struct file_operations page_map_fops = {
-       .llseek = page_map_seek,
-       .read   = page_map_read,
-       .mmap   = page_map_mmap
-};
-
-/*
- * Create the ppc64 and ppc64/rtas directories early. This allows us to
- * assume that they have been previously created in drivers.
- */
-static int __init proc_ppc64_create(void)
-{
-       struct proc_dir_entry *root;
-
-       root = proc_mkdir("ppc64", NULL);
-       if (!root)
-               return 1;
-
-       if (!of_find_node_by_path("/rtas"))
-               return 0;
-
-       if (!proc_mkdir("rtas", root))
-               return 1;
-
-       if (!proc_symlink("rtas", NULL, "ppc64/rtas"))
-               return 1;
-
-       return 0;
-}
-core_initcall(proc_ppc64_create);
-
-static int __init proc_ppc64_init(void)
-{
-       struct proc_dir_entry *pde;
-
-       pde = proc_create_data("ppc64/systemcfg", S_IFREG|S_IRUGO, NULL,
-                              &page_map_fops, vdso_data);
-       if (!pde)
-               return 1;
-       pde->size = PAGE_SIZE;
-
-       return 0;
-}
-__initcall(proc_ppc64_init);
-
-static loff_t page_map_seek( struct file *file, loff_t off, int whence)
-{
-       loff_t new;
-       struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
-
-       switch(whence) {
-       case 0:
-               new = off;
-               break;
-       case 1:
-               new = file->f_pos + off;
-               break;
-       case 2:
-               new = dp->size + off;
-               break;
-       default:
-               return -EINVAL;
-       }
-       if ( new < 0 || new > dp->size )
-               return -EINVAL;
-       return (file->f_pos = new);
-}
-
-static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes,
-                             loff_t *ppos)
-{
-       struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
-       return simple_read_from_buffer(buf, nbytes, ppos, dp->data, dp->size);
-}
-
-static int page_map_mmap( struct file *file, struct vm_area_struct *vma )
-{
-       struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode);
-
-       if ((vma->vm_end - vma->vm_start) > dp->size)
-               return -EINVAL;
-
-       remap_pfn_range(vma, vma->vm_start, __pa(dp->data) >> PAGE_SHIFT,
-                                               dp->size, vma->vm_page_prot);
-       return 0;
-}
-
index 13011a96a9779b274e77447dd5b72dae4e56142a..a85117d5c9a4c3576ac60d0b774df2c43ca4f9fc 100644 (file)
@@ -6,7 +6,7 @@
  *      as published by the Free Software Foundation; either version
  *      2 of the License, or (at your option) any later version.
  *
- * /proc/ppc64/rtas/firmware_flash interface
+ * /proc/powerpc/rtas/firmware_flash interface
  *
  * This file implements a firmware_flash interface to pump a firmware
  * image into the kernel.  At reboot time rtas_restart() will see the
@@ -740,7 +740,7 @@ static int __init rtas_flash_init(void)
                return 1;
        }
 
-       firmware_flash_pde = create_flash_pde("ppc64/rtas/"
+       firmware_flash_pde = create_flash_pde("powerpc/rtas/"
                                              FIRMWARE_FLASH_NAME,
                                              &rtas_flash_operations);
        if (firmware_flash_pde == NULL) {
@@ -754,7 +754,7 @@ static int __init rtas_flash_init(void)
        if (rc != 0)
                goto cleanup;
 
-       firmware_update_pde = create_flash_pde("ppc64/rtas/"
+       firmware_update_pde = create_flash_pde("powerpc/rtas/"
                                               FIRMWARE_UPDATE_NAME,
                                               &rtas_flash_operations);
        if (firmware_update_pde == NULL) {
@@ -768,7 +768,7 @@ static int __init rtas_flash_init(void)
        if (rc != 0)
                goto cleanup;
 
-       validate_pde = create_flash_pde("ppc64/rtas/" VALIDATE_FLASH_NAME,
+       validate_pde = create_flash_pde("powerpc/rtas/" VALIDATE_FLASH_NAME,
                                        &validate_flash_operations);
        if (validate_pde == NULL) {
                rc = -ENOMEM;
@@ -781,7 +781,7 @@ static int __init rtas_flash_init(void)
        if (rc != 0)
                goto cleanup;
 
-       manage_pde = create_flash_pde("ppc64/rtas/" MANAGE_FLASH_NAME,
+       manage_pde = create_flash_pde("powerpc/rtas/" MANAGE_FLASH_NAME,
                                      &manage_flash_operations);
        if (manage_pde == NULL) {
                rc = -ENOMEM;
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
new file mode 100644 (file)
index 0000000..2e4832a
--- /dev/null
@@ -0,0 +1,539 @@
+/*
+ * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Communication to userspace based on kernel/printk.c
+ */
+
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+#include <linux/workqueue.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/rtas.h>
+#include <asm/prom.h>
+#include <asm/nvram.h>
+#include <asm/atomic.h>
+#include <asm/machdep.h>
+
+
+static DEFINE_SPINLOCK(rtasd_log_lock);
+
+static DECLARE_WAIT_QUEUE_HEAD(rtas_log_wait);
+
+static char *rtas_log_buf;
+static unsigned long rtas_log_start;
+static unsigned long rtas_log_size;
+
+static int surveillance_timeout = -1;
+
+static unsigned int rtas_error_log_max;
+static unsigned int rtas_error_log_buffer_max;
+
+/* RTAS service tokens */
+static unsigned int event_scan;
+static unsigned int rtas_event_scan_rate;
+
+static int full_rtas_msgs = 0;
+
+/* Stop logging to nvram after first fatal error */
+static int logging_enabled; /* Until we initialize everything,
+                             * make sure we don't try logging
+                             * anything */
+static int error_log_cnt;
+
+/*
+ * Since we use 32 bit RTAS, the physical address of this must be below
+ * 4G or else bad things happen. Allocate this in the kernel data and
+ * make it big enough.
+ */
+static unsigned char logdata[RTAS_ERROR_LOG_MAX];
+
+static char *rtas_type[] = {
+       "Unknown", "Retry", "TCE Error", "Internal Device Failure",
+       "Timeout", "Data Parity", "Address Parity", "Cache Parity",
+       "Address Invalid", "ECC Uncorrected", "ECC Corrupted",
+};
+
+static char *rtas_event_type(int type)
+{
+       if ((type > 0) && (type < 11))
+               return rtas_type[type];
+
+       switch (type) {
+               case RTAS_TYPE_EPOW:
+                       return "EPOW";
+               case RTAS_TYPE_PLATFORM:
+                       return "Platform Error";
+               case RTAS_TYPE_IO:
+                       return "I/O Event";
+               case RTAS_TYPE_INFO:
+                       return "Platform Information Event";
+               case RTAS_TYPE_DEALLOC:
+                       return "Resource Deallocation Event";
+               case RTAS_TYPE_DUMP:
+                       return "Dump Notification Event";
+       }
+
+       return rtas_type[0];
+}
+
+/* To see this info, grep for RTAS in /var/log/messages; each entry is
+ * bracketed by obvious begin/end lines. A unique identifier on the begin
+ * and end lines persists across reboots.
+ *
+ * format of error logs returned from RTAS:
+ * bytes       (size)  : contents
+ * --------------------------------------------------------
+ * 0-7         (8)     : rtas_error_log
+ * 8-47                (40)    : extended info
+ * 48-51       (4)     : vendor id
+ * 52-1023 (vendor specific) : location code and debug data
+ */
+static void printk_log_rtas(char *buf, int len)
+{
+       int i, j, n = 0;
+       int perline = 16;
+       char buffer[64];
+       char *str = "RTAS event";
+
+       if (full_rtas_msgs) {
+               printk(RTAS_DEBUG "%d -------- %s begin --------\n",
+                      error_log_cnt, str);
+
+               /*
+                * Print perline bytes on each line, each line will start
+                * with RTAS and a changing number, so syslogd will
+                * print lines that are otherwise the same.  Separate every
+                * 4 bytes with a space.
+                */
+               for (i = 0; i < len; i++) {
+                       j = i % perline;
+                       if (j == 0) {
+                               memset(buffer, 0, sizeof(buffer));
+                               n = sprintf(buffer, "RTAS %d:", i/perline);
+                       }
+
+                       if ((i % 4) == 0)
+                               n += sprintf(buffer+n, " ");
+
+                       n += sprintf(buffer+n, "%02x", (unsigned char)buf[i]);
+
+                       if (j == (perline-1))
+                               printk(KERN_DEBUG "%s\n", buffer);
+               }
+               if ((i % perline) != 0)
+                       printk(KERN_DEBUG "%s\n", buffer);
+
+               printk(RTAS_DEBUG "%d -------- %s end ----------\n",
+                      error_log_cnt, str);
+       } else {
+               struct rtas_error_log *errlog = (struct rtas_error_log *)buf;
+
+               printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n",
+                      error_log_cnt, rtas_event_type(errlog->type),
+                      errlog->severity);
+       }
+}
+
+static int log_rtas_len(char *buf)
+{
+       int len;
+       struct rtas_error_log *err;
+
+       /* rtas fixed header */
+       len = 8;
+       err = (struct rtas_error_log *)buf;
+       if (err->extended_log_length) {
+               /* extended header */
+               len += err->extended_log_length;
+       }
+
+       if (rtas_error_log_max == 0)
+               rtas_error_log_max = rtas_get_error_log_max();
+
+       if (len > rtas_error_log_max)
+               len = rtas_error_log_max;
+
+       return len;
+}
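
log_rtas_len() follows the layout documented above: an 8-byte fixed header plus an optional extended section, clamped to the maximum length RTAS reports. A self-contained model of that computation (the struct here is abbreviated to the single field used; it is not the kernel definition):

    #include <stdio.h>

    struct rtas_error_log_hdr {                     /* illustrative only */
            unsigned int extended_log_length;
    };

    static int log_len(const struct rtas_error_log_hdr *err, int log_max)
    {
            int len = 8;                            /* rtas fixed header */

            if (err->extended_log_length)
                    len += err->extended_log_length;
            return len > log_max ? log_max : len;
    }

    int main(void)
    {
            struct rtas_error_log_hdr e = { .extended_log_length = 40 };

            printf("%d\n", log_len(&e, 2048));      /* prints 48 */
            return 0;
    }
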
+
+/*
+ * First write to nvram, if fatal error, that is the only
+ * place we log the info.  The error will be picked up
+ * on the next reboot by rtasd.  If not fatal, run the
+ * method for the type of error.  Currently, only RTAS
+ * errors have methods implemented, but in the future
+ * there might be a need to store data in nvram before a
+ * call to panic().
+ *
+ * XXX We write to nvram periodically, to indicate error has
+ * been written and sync'd, but there is a possibility
+ * that if we don't shutdown correctly, a duplicate error
+ * record will be created on next reboot.
+ */
+void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
+{
+       unsigned long offset;
+       unsigned long s;
+       int len = 0;
+
+       pr_debug("rtasd: logging event\n");
+       if (buf == NULL)
+               return;
+
+       spin_lock_irqsave(&rtasd_log_lock, s);
+
+       /* get length and increase count */
+       switch (err_type & ERR_TYPE_MASK) {
+       case ERR_TYPE_RTAS_LOG:
+               len = log_rtas_len(buf);
+               if (!(err_type & ERR_FLAG_BOOT))
+                       error_log_cnt++;
+               break;
+       case ERR_TYPE_KERNEL_PANIC:
+       default:
+               WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+               spin_unlock_irqrestore(&rtasd_log_lock, s);
+               return;
+       }
+
+#ifdef CONFIG_PPC64
+       /* Write error to NVRAM */
+       if (logging_enabled && !(err_type & ERR_FLAG_BOOT))
+               nvram_write_error_log(buf, len, err_type, error_log_cnt);
+#endif /* CONFIG_PPC64 */
+
+       /*
+        * rtas errors can occur during boot, and we do want to capture
+        * those somewhere, even if nvram isn't ready (why not?), and even
+        * if rtasd isn't ready. Put them into the boot log, at least.
+        */
+       if ((err_type & ERR_TYPE_MASK) == ERR_TYPE_RTAS_LOG)
+               printk_log_rtas(buf, len);
+
+       /* Check to see if we need to or have stopped logging */
+       if (fatal || !logging_enabled) {
+               logging_enabled = 0;
+               WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+               spin_unlock_irqrestore(&rtasd_log_lock, s);
+               return;
+       }
+
+       /* call type specific method for error */
+       switch (err_type & ERR_TYPE_MASK) {
+       case ERR_TYPE_RTAS_LOG:
+               offset = rtas_error_log_buffer_max *
+                       ((rtas_log_start+rtas_log_size) & LOG_NUMBER_MASK);
+
+               /* First copy over sequence number */
+               memcpy(&rtas_log_buf[offset], (void *) &error_log_cnt, sizeof(int));
+
+               /* Second copy over error log data */
+               offset += sizeof(int);
+               memcpy(&rtas_log_buf[offset], buf, len);
+
+               if (rtas_log_size < LOG_NUMBER)
+                       rtas_log_size += 1;
+               else
+                       rtas_log_start += 1;
+
+               WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+               spin_unlock_irqrestore(&rtasd_log_lock, s);
+               wake_up_interruptible(&rtas_log_wait);
+               break;
+       case ERR_TYPE_KERNEL_PANIC:
+       default:
+               WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
+               spin_unlock_irqrestore(&rtasd_log_lock, s);
+               return;
+       }
+}
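
The in-memory log above is a ring of LOG_NUMBER fixed-size records: each slot holds the sequence number (an int) followed by the event data, and (rtas_log_start + rtas_log_size) & LOG_NUMBER_MASK selects the slot to write. A sketch of the index arithmetic with an illustrative ring size (the mask trick requires LOG_NUMBER to be a power of two):

    #include <stdio.h>

    #define LOG_NUMBER      64              /* illustrative value */
    #define LOG_NUMBER_MASK (LOG_NUMBER - 1)

    int main(void)
    {
            unsigned long start = 60, size = 6;     /* ring has wrapped */
            unsigned long slot = (start + size) & LOG_NUMBER_MASK;

            printf("next write goes to slot %lu\n", slot);  /* slot 2 */
            return 0;
    }
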
+
+static int rtas_log_open(struct inode *inode, struct file *file)
+{
+       return 0;
+}
+
+static int rtas_log_release(struct inode *inode, struct file *file)
+{
+       return 0;
+}
+
+/* This checks whether all events have been logged; if they have, we know
+ * we can safely clear the events in NVRAM. Then we sit and wait for
+ * something else to be logged.
+ */
+static ssize_t rtas_log_read(struct file *file, char __user *buf,
+                            size_t count, loff_t *ppos)
+{
+       int error;
+       char *tmp;
+       unsigned long s;
+       unsigned long offset;
+
+       if (!buf || count < rtas_error_log_buffer_max)
+               return -EINVAL;
+
+       count = rtas_error_log_buffer_max;
+
+       if (!access_ok(VERIFY_WRITE, buf, count))
+               return -EFAULT;
+
+       tmp = kmalloc(count, GFP_KERNEL);
+       if (!tmp)
+               return -ENOMEM;
+
+       spin_lock_irqsave(&rtasd_log_lock, s);
+
+       /* if it's 0, then we know we got the last one (the one in NVRAM) */
+       while (rtas_log_size == 0) {
+               if (file->f_flags & O_NONBLOCK) {
+                       spin_unlock_irqrestore(&rtasd_log_lock, s);
+                       error = -EAGAIN;
+                       goto out;
+               }
+
+               if (!logging_enabled) {
+                       spin_unlock_irqrestore(&rtasd_log_lock, s);
+                       error = -ENODATA;
+                       goto out;
+               }
+#ifdef CONFIG_PPC64
+               nvram_clear_error_log();
+#endif /* CONFIG_PPC64 */
+
+               spin_unlock_irqrestore(&rtasd_log_lock, s);
+               error = wait_event_interruptible(rtas_log_wait, rtas_log_size);
+               if (error)
+                       goto out;
+               spin_lock_irqsave(&rtasd_log_lock, s);
+       }
+
+       offset = rtas_error_log_buffer_max * (rtas_log_start & LOG_NUMBER_MASK);
+       memcpy(tmp, &rtas_log_buf[offset], count);
+
+       rtas_log_start += 1;
+       rtas_log_size -= 1;
+       spin_unlock_irqrestore(&rtasd_log_lock, s);
+
+       error = copy_to_user(buf, tmp, count) ? -EFAULT : count;
+out:
+       kfree(tmp);
+       return error;
+}
+
+static unsigned int rtas_log_poll(struct file *file, poll_table *wait)
+{
+       poll_wait(file, &rtas_log_wait, wait);
+       if (rtas_log_size)
+               return POLLIN | POLLRDNORM;
+       return 0;
+}
+
+static const struct file_operations proc_rtas_log_operations = {
+       .read =         rtas_log_read,
+       .poll =         rtas_log_poll,
+       .open =         rtas_log_open,
+       .release =      rtas_log_release,
+};
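
With read() blocking until an event arrives and poll() reporting readability, a userspace daemon can consume events with a conventional loop. A minimal consumer sketch (the buffer size is hardcoded for illustration; the real daemon should size it to at least rtas_error_log_buffer_max, since smaller reads get -EINVAL):

    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[4096];
            int fd = open("/proc/powerpc/rtas/error_log", O_RDONLY);
            struct pollfd pfd = { .fd = fd, .events = POLLIN };

            if (fd < 0)
                    return 1;
            while (poll(&pfd, 1, -1) > 0) {
                    ssize_t n = read(fd, buf, sizeof(buf));

                    if (n <= 0)
                            break;
                    printf("got %zd-byte event\n", n);
            }
            close(fd);
            return 0;
    }
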
+
+static int enable_surveillance(int timeout)
+{
+       int error;
+
+       error = rtas_set_indicator(SURVEILLANCE_TOKEN, 0, timeout);
+
+       if (error == 0)
+               return 0;
+
+       if (error == -EINVAL) {
+               printk(KERN_DEBUG "rtasd: surveillance not supported\n");
+               return 0;
+       }
+
+       printk(KERN_ERR "rtasd: could not update surveillance\n");
+       return -1;
+}
+
+static void do_event_scan(void)
+{
+       int error;
+       do {
+               memset(logdata, 0, rtas_error_log_max);
+               error = rtas_call(event_scan, 4, 1, NULL,
+                                 RTAS_EVENT_SCAN_ALL_EVENTS, 0,
+                                 __pa(logdata), rtas_error_log_max);
+               if (error == -1) {
+                       printk(KERN_ERR "event-scan failed\n");
+                       break;
+               }
+
+               if (error == 0)
+                       pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0);
+
+       } while (error == 0);
+}
+
+static void rtas_event_scan(struct work_struct *w);
+DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan);
+
+/*
+ * Delay should be at least one second since some machines have problems if
+ * we call event-scan too quickly.
+ */
+static unsigned long event_scan_delay = 1*HZ;
+static int first_pass = 1;
+
+static void rtas_event_scan(struct work_struct *w)
+{
+       unsigned int cpu;
+
+       do_event_scan();
+
+       get_online_cpus();
+
+       cpu = next_cpu(smp_processor_id(), cpu_online_map);
+       if (cpu == NR_CPUS) {
+               cpu = first_cpu(cpu_online_map);
+
+               if (first_pass) {
+                       first_pass = 0;
+                       event_scan_delay = 30*HZ/rtas_event_scan_rate;
+
+                       if (surveillance_timeout != -1) {
+                               pr_debug("rtasd: enabling surveillance\n");
+                               enable_surveillance(surveillance_timeout);
+                               pr_debug("rtasd: surveillance enabled\n");
+                       }
+               }
+       }
+
+       schedule_delayed_work_on(cpu, &event_scan_work,
+               __round_jiffies_relative(event_scan_delay, cpu));
+
+       put_online_cpus();
+}
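
rtas_event_scan() walks the online CPUs round-robin and, only once a full pass completes (the walk wraps back to the first CPU), stretches the delay and arms surveillance. The rotation reduces to modular iteration; a sketch with a dense CPU numbering standing in for the cpumask API:

    #include <stdio.h>

    /* Illustrative stand-in for next_cpu()/first_cpu() over a dense map. */
    static int next_online(int cpu, int nr_cpus)
    {
            return (cpu + 1) % nr_cpus;
    }

    int main(void)
    {
            int cpu = 0, hops;

            for (hops = 0; hops < 8; hops++) {
                    cpu = next_online(cpu, 4);
                    if (cpu == 0)   /* wrapped: one scan per CPU done */
                            printf("full pass complete at hop %d\n", hops);
            }
            return 0;
    }
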
+
+#ifdef CONFIG_PPC64
+static void retrieve_nvram_error_log(void)
+{
+       unsigned int err_type;
+       int rc;
+
+       /* See if we have any error stored in NVRAM */
+       memset(logdata, 0, rtas_error_log_max);
+       rc = nvram_read_error_log(logdata, rtas_error_log_max,
+                                 &err_type, &error_log_cnt);
+       /* We can use rtas_log_buf now */
+       logging_enabled = 1;
+       if (!rc && err_type != ERR_FLAG_ALREADY_LOGGED)
+               pSeries_log_error(logdata, err_type | ERR_FLAG_BOOT, 0);
+}
+#else /* CONFIG_PPC64 */
+static void retrieve_nvram_error_log(void)
+{
+}
+#endif /* CONFIG_PPC64 */
+
+static void start_event_scan(void)
+{
+       printk(KERN_DEBUG "RTAS daemon started\n");
+       pr_debug("rtasd: will sleep for %d milliseconds\n",
+                (30000 / rtas_event_scan_rate));
+
+       /* Retrieve any errors stored in NVRAM */
+       retrieve_nvram_error_log();
+
+       schedule_delayed_work_on(first_cpu(cpu_online_map), &event_scan_work,
+                                event_scan_delay);
+}
+
+static int __init rtas_init(void)
+{
+       struct proc_dir_entry *entry;
+
+       if (!machine_is(pseries) && !machine_is(chrp))
+               return 0;
+
+       /* No RTAS */
+       event_scan = rtas_token("event-scan");
+       if (event_scan == RTAS_UNKNOWN_SERVICE) {
+               printk(KERN_INFO "rtasd: No event-scan on system\n");
+               return -ENODEV;
+       }
+
+       rtas_event_scan_rate = rtas_token("rtas-event-scan-rate");
+       if (rtas_event_scan_rate == RTAS_UNKNOWN_SERVICE) {
+               printk(KERN_ERR "rtasd: no rtas-event-scan-rate on system\n");
+               return -ENODEV;
+       }
+
+       /* Make room for the sequence number */
+       rtas_error_log_max = rtas_get_error_log_max();
+       rtas_error_log_buffer_max = rtas_error_log_max + sizeof(int);
+
+       rtas_log_buf = vmalloc(rtas_error_log_buffer_max*LOG_NUMBER);
+       if (!rtas_log_buf) {
+               printk(KERN_ERR "rtasd: no memory\n");
+               return -ENOMEM;
+       }
+
+       entry = proc_create("powerpc/rtas/error_log", S_IRUSR, NULL,
+                           &proc_rtas_log_operations);
+       if (!entry)
+               printk(KERN_ERR "Failed to create error_log proc entry\n");
+
+       start_event_scan();
+
+       return 0;
+}
+__initcall(rtas_init);
+
+static int __init surveillance_setup(char *str)
+{
+       int i;
+
+       /* We only do surveillance on pseries */
+       if (!machine_is(pseries))
+               return 0;
+
+       if (get_option(&str,&i)) {
+               if (i >= 0 && i <= 255)
+                       surveillance_timeout = i;
+       }
+
+       return 1;
+}
+__setup("surveillance=", surveillance_setup);
+
+static int __init rtasmsgs_setup(char *str)
+{
+       if (strcmp(str, "on") == 0)
+               full_rtas_msgs = 1;
+       else if (strcmp(str, "off") == 0)
+               full_rtas_msgs = 0;
+
+       return 1;
+}
+__setup("rtasmsgs=", rtasmsgs_setup);
index 04f638d82fb3cd3ff5387bfe62f395144e4f7c82..fd785f7a279bbe808781f4b03d5ff51e87fc5f15 100644 (file)
@@ -356,11 +356,6 @@ void __init setup_system(void)
         */
        initialize_cache_info();
 
-       /*
-        * Initialize irq remapping subsystem
-        */
-       irq_early_init();
-
 #ifdef CONFIG_PPC_RTAS
        /*
         * Initialize RTAS if available
index a136a11c490d0f36a9b32812132e682c38262f8f..6c9e20898fa760d68f0508b404b74211cc258f54 100644 (file)
@@ -268,6 +268,7 @@ void account_system_vtime(struct task_struct *tsk)
        per_cpu(cputime_scaled_last_delta, smp_processor_id()) = deltascaled;
        local_irq_restore(flags);
 }
+EXPORT_SYMBOL_GPL(account_system_vtime);
 
 /*
  * Transfer the user and system times accumulated in the paca
index c2992684661389f9686a73b5fbf32e659a2886f9..07703f72330e1f0f468be6a0ce75d4b19a12daa8 100644 (file)
@@ -21,6 +21,23 @@ config KVM
        select PREEMPT_NOTIFIERS
        select ANON_INODES
 
+config KVM_BOOK3S_64_HANDLER
+       bool
+
+config KVM_BOOK3S_64
+       tristate "KVM support for PowerPC book3s_64 processors"
+       depends on EXPERIMENTAL && PPC64
+       select KVM
+       select KVM_BOOK3S_64_HANDLER
+       ---help---
+         Support running unmodified book3s_64 and book3s_32 guest kernels
+         in virtual machines on book3s_64 host processors.
+
+         This module provides access to the hardware capabilities through
+         a character device node named /dev/kvm.
+
+         If unsure, say N.
+
 config KVM_440
        bool "KVM support for PowerPC 440 processors"
        depends on EXPERIMENTAL && 44x
index 37655fe19f2f7431c946d2e6405d3d96aaf10fa1..56484d6523777582f3a44e84850ead22cb237d52 100644 (file)
@@ -12,26 +12,45 @@ CFLAGS_44x_tlb.o  := -I.
 CFLAGS_e500_tlb.o := -I.
 CFLAGS_emulate.o  := -I.
 
-kvm-objs := $(common-objs-y) powerpc.o emulate.o
+common-objs-y += powerpc.o emulate.o
 obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
-obj-$(CONFIG_KVM) += kvm.o
+obj-$(CONFIG_KVM_BOOK3S_64_HANDLER) += book3s_64_exports.o
 
 AFLAGS_booke_interrupts.o := -I$(obj)
 
 kvm-440-objs := \
+       $(common-objs-y) \
        booke.o \
        booke_emulate.o \
        booke_interrupts.o \
        44x.o \
        44x_tlb.o \
        44x_emulate.o
-obj-$(CONFIG_KVM_440) += kvm-440.o
+kvm-objs-$(CONFIG_KVM_440) := $(kvm-440-objs)
 
 kvm-e500-objs := \
+       $(common-objs-y) \
        booke.o \
        booke_emulate.o \
        booke_interrupts.o \
        e500.o \
        e500_tlb.o \
        e500_emulate.o
-obj-$(CONFIG_KVM_E500) += kvm-e500.o
+kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs)
+
+kvm-book3s_64-objs := \
+       $(common-objs-y) \
+       book3s.o \
+       book3s_64_emulate.o \
+       book3s_64_interrupts.o \
+       book3s_64_mmu_host.o \
+       book3s_64_mmu.o \
+       book3s_32_mmu.o
+kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs)
+
+kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
+
+obj-$(CONFIG_KVM_440) += kvm.o
+obj-$(CONFIG_KVM_E500) += kvm.o
+obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o
+
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
new file mode 100644 (file)
index 0000000..42037d4
--- /dev/null
@@ -0,0 +1,925 @@
+/*
+ * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *    Alexander Graf <agraf@suse.de>
+ *    Kevin Wolf <mail@kevin-wolf.de>
+ *
+ * Description:
+ * This file is derived from arch/powerpc/kvm/44x.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+
+#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+
+/* #define EXIT_DEBUG */
+/* #define EXIT_DEBUG_SIMPLE */
+
+/* Without AGGRESSIVE_DEC we only fire off a DEC interrupt when DEC turns 0.
+ * When set, we retrigger a DEC interrupt after that if DEC <= 0.
+ * PPC32 Linux runs faster without AGGRESSIVE_DEC, PPC64 Linux requires it. */
+
+/* #define AGGRESSIVE_DEC */
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+       { "exits",       VCPU_STAT(sum_exits) },
+       { "mmio",        VCPU_STAT(mmio_exits) },
+       { "sig",         VCPU_STAT(signal_exits) },
+       { "sysc",        VCPU_STAT(syscall_exits) },
+       { "inst_emu",    VCPU_STAT(emulated_inst_exits) },
+       { "dec",         VCPU_STAT(dec_exits) },
+       { "ext_intr",    VCPU_STAT(ext_intr_exits) },
+       { "queue_intr",  VCPU_STAT(queue_intr) },
+       { "halt_wakeup", VCPU_STAT(halt_wakeup) },
+       { "pf_storage",  VCPU_STAT(pf_storage) },
+       { "sp_storage",  VCPU_STAT(sp_storage) },
+       { "pf_instruc",  VCPU_STAT(pf_instruc) },
+       { "sp_instruc",  VCPU_STAT(sp_instruc) },
+       { "ld",          VCPU_STAT(ld) },
+       { "ld_slow",     VCPU_STAT(ld_slow) },
+       { "st",          VCPU_STAT(st) },
+       { "st_slow",     VCPU_STAT(st_slow) },
+       { NULL }
+};
+
+void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       memcpy(get_paca()->kvm_slb, to_book3s(vcpu)->slb_shadow, sizeof(get_paca()->kvm_slb));
+       get_paca()->kvm_slb_max = to_book3s(vcpu)->slb_shadow_max;
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+       memcpy(to_book3s(vcpu)->slb_shadow, get_paca()->kvm_slb, sizeof(get_paca()->kvm_slb));
+       to_book3s(vcpu)->slb_shadow_max = get_paca()->kvm_slb_max;
+}
+
+#if defined(AGGRESSIVE_DEC) || defined(EXIT_DEBUG)
+static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu)
+{
+       u64 jd = mftb() - vcpu->arch.dec_jiffies;
+       return vcpu->arch.dec - jd;
+}
+#endif
+
+void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
+{
+       ulong old_msr = vcpu->arch.msr;
+
+#ifdef EXIT_DEBUG
+       printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr);
+#endif
+       msr &= to_book3s(vcpu)->msr_mask;
+       vcpu->arch.msr = msr;
+       vcpu->arch.shadow_msr = msr | MSR_USER32;
+       vcpu->arch.shadow_msr &= ( MSR_VEC | MSR_VSX | MSR_FP | MSR_FE0 |
+                                  MSR_USER64 | MSR_SE | MSR_BE | MSR_DE |
+                                  MSR_FE1);
+
+       if (msr & (MSR_WE|MSR_POW)) {
+               if (!vcpu->arch.pending_exceptions) {
+                       kvm_vcpu_block(vcpu);
+                       vcpu->stat.halt_wakeup++;
+               }
+       }
+
+       if (((vcpu->arch.msr & (MSR_IR|MSR_DR)) != (old_msr & (MSR_IR|MSR_DR))) ||
+           (vcpu->arch.msr & MSR_PR) != (old_msr & MSR_PR)) {
+               kvmppc_mmu_flush_segments(vcpu);
+               kvmppc_mmu_map_segment(vcpu, vcpu->arch.pc);
+       }
+}
+
+void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags)
+{
+       vcpu->arch.srr0 = vcpu->arch.pc;
+       vcpu->arch.srr1 = vcpu->arch.msr | flags;
+       vcpu->arch.pc = to_book3s(vcpu)->hior + vec;
+       vcpu->arch.mmu.reset_msr(vcpu);
+}
+
+void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec)
+{
+       unsigned int prio;
+
+       vcpu->stat.queue_intr++;
+       switch (vec) {
+       case 0x100: prio = BOOK3S_IRQPRIO_SYSTEM_RESET;         break;
+       case 0x200: prio = BOOK3S_IRQPRIO_MACHINE_CHECK;        break;
+       case 0x300: prio = BOOK3S_IRQPRIO_DATA_STORAGE;         break;
+       case 0x380: prio = BOOK3S_IRQPRIO_DATA_SEGMENT;         break;
+       case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE;         break;
+       case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT;         break;
+       case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL;             break;
+       case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT;            break;
+       case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM;              break;
+       case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL;           break;
+       case 0x900: prio = BOOK3S_IRQPRIO_DECREMENTER;          break;
+       case 0xc00: prio = BOOK3S_IRQPRIO_SYSCALL;              break;
+       case 0xd00: prio = BOOK3S_IRQPRIO_DEBUG;                break;
+       case 0xf20: prio = BOOK3S_IRQPRIO_ALTIVEC;              break;
+       case 0xf40: prio = BOOK3S_IRQPRIO_VSX;                  break;
+       default:    prio = BOOK3S_IRQPRIO_MAX;                  break;
+       }
+
+       set_bit(prio, &vcpu->arch.pending_exceptions);
+#ifdef EXIT_DEBUG
+       printk(KERN_INFO "Queueing interrupt %x\n", vec);
+#endif
+}
+
+void kvmppc_core_queue_program(struct kvm_vcpu *vcpu)
+{
+       kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_PROGRAM);
+}
+
+void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
+{
+       kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER);
+}
+
+int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)
+{
+       return test_bit(BOOK3S_INTERRUPT_DECREMENTER >> 7, &vcpu->arch.pending_exceptions);
+}
+
+void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
+                                struct kvm_interrupt *irq)
+{
+       kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
+}
+
+int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
+{
+       int deliver = 1;
+       int vec = 0;
+
+       switch (priority) {
+       case BOOK3S_IRQPRIO_DECREMENTER:
+               deliver = vcpu->arch.msr & MSR_EE;
+               vec = BOOK3S_INTERRUPT_DECREMENTER;
+               break;
+       case BOOK3S_IRQPRIO_EXTERNAL:
+               deliver = vcpu->arch.msr & MSR_EE;
+               vec = BOOK3S_INTERRUPT_EXTERNAL;
+               break;
+       case BOOK3S_IRQPRIO_SYSTEM_RESET:
+               vec = BOOK3S_INTERRUPT_SYSTEM_RESET;
+               break;
+       case BOOK3S_IRQPRIO_MACHINE_CHECK:
+               vec = BOOK3S_INTERRUPT_MACHINE_CHECK;
+               break;
+       case BOOK3S_IRQPRIO_DATA_STORAGE:
+               vec = BOOK3S_INTERRUPT_DATA_STORAGE;
+               break;
+       case BOOK3S_IRQPRIO_INST_STORAGE:
+               vec = BOOK3S_INTERRUPT_INST_STORAGE;
+               break;
+       case BOOK3S_IRQPRIO_DATA_SEGMENT:
+               vec = BOOK3S_INTERRUPT_DATA_SEGMENT;
+               break;
+       case BOOK3S_IRQPRIO_INST_SEGMENT:
+               vec = BOOK3S_INTERRUPT_INST_SEGMENT;
+               break;
+       case BOOK3S_IRQPRIO_ALIGNMENT:
+               vec = BOOK3S_INTERRUPT_ALIGNMENT;
+               break;
+       case BOOK3S_IRQPRIO_PROGRAM:
+               vec = BOOK3S_INTERRUPT_PROGRAM;
+               break;
+       case BOOK3S_IRQPRIO_VSX:
+               vec = BOOK3S_INTERRUPT_VSX;
+               break;
+       case BOOK3S_IRQPRIO_ALTIVEC:
+               vec = BOOK3S_INTERRUPT_ALTIVEC;
+               break;
+       case BOOK3S_IRQPRIO_FP_UNAVAIL:
+               vec = BOOK3S_INTERRUPT_FP_UNAVAIL;
+               break;
+       case BOOK3S_IRQPRIO_SYSCALL:
+               vec = BOOK3S_INTERRUPT_SYSCALL;
+               break;
+       case BOOK3S_IRQPRIO_DEBUG:
+               vec = BOOK3S_INTERRUPT_TRACE;
+               break;
+       case BOOK3S_IRQPRIO_PERFORMANCE_MONITOR:
+               vec = BOOK3S_INTERRUPT_PERFMON;
+               break;
+       default:
+               deliver = 0;
+               printk(KERN_ERR "KVM: Unknown interrupt: 0x%x\n", priority);
+               break;
+       }
+
+#if 0
+       printk(KERN_INFO "Deliver interrupt 0x%x? %x\n", vec, deliver);
+#endif
+
+       if (deliver)
+               kvmppc_inject_interrupt(vcpu, vec, 0ULL);
+
+       return deliver;
+}
+
+void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
+{
+       unsigned long *pending = &vcpu->arch.pending_exceptions;
+       unsigned int priority;
+
+       /* XXX be more clever here - no need to mftb() on every entry */
+       /* Issue DEC again if it's still active */
+#ifdef AGGRESSIVE_DEC
+       if (vcpu->arch.msr & MSR_EE)
+               if (kvmppc_get_dec(vcpu) & 0x80000000)
+                       kvmppc_core_queue_dec(vcpu);
+#endif
+
+#ifdef EXIT_DEBUG
+       if (vcpu->arch.pending_exceptions)
+               printk(KERN_EMERG "KVM: Check pending: %lx\n", vcpu->arch.pending_exceptions);
+#endif
+       priority = __ffs(*pending);
+       while (priority < (sizeof(*pending) * 8)) {
+               if (kvmppc_book3s_irqprio_deliver(vcpu, priority)) {
+                       clear_bit(priority, &vcpu->arch.pending_exceptions);
+                       break;
+               }
+
+               priority = find_next_bit(pending,
+                                        BITS_PER_BYTE * sizeof(*pending),
+                                        priority + 1);
+       }
+}
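
Pending exceptions live in a single unsigned long bitmap ordered by priority; delivery scans upward from the lowest set bit, and the kernel loop above breaks after the first vector it actually injects. A userspace model of the scan order (using GCC's __builtin_ctzl in place of __ffs()/find_next_bit(), and draining the whole mask just to show the iteration):

    #include <stdio.h>

    int main(void)
    {
            unsigned long pending = (1UL << 9) | (1UL << 3);

            while (pending) {
                    unsigned int prio = __builtin_ctzl(pending);

                    printf("deliver priority %u\n", prio);  /* 3, then 9 */
                    pending &= ~(1UL << prio);              /* clear_bit() */
            }
            return 0;
    }
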
+
+void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr)
+{
+       vcpu->arch.pvr = pvr;
+       if ((pvr >= 0x330000) && (pvr < 0x70330000)) {
+               kvmppc_mmu_book3s_64_init(vcpu);
+               to_book3s(vcpu)->hior = 0xfff00000;
+               to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL;
+       } else {
+               kvmppc_mmu_book3s_32_init(vcpu);
+               to_book3s(vcpu)->hior = 0;
+               to_book3s(vcpu)->msr_mask = 0xffffffffULL;
+       }
+
+       /* If we are at hypervisor level on a 970, we can tell the CPU to
+        * treat DCBZ as a 32-byte store */
+       vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32;
+       if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) &&
+           !strcmp(cur_cpu_spec->platform, "ppc970"))
+               vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+}
+
+/* Book3s_32 CPUs always have a 32-byte cache line size, which Linux assumes.
+ * To make Book3s_32 Linux work on Book3s_64, we have to make sure we trap
+ * dcbz so we can emulate the 32-byte dcbz length.
+ *
+ * The Book3s_64 designers also anticipated this case and implemented a
+ * special bit in the HID5 register, but that is a hypervisor resource, so
+ * we can't use it.
+ *
+ * My approach here is to patch the dcbz instruction on executing pages.
+ */
+static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
+{
+       bool touched = false;
+       hva_t hpage;
+       u32 *page;
+       int i;
+
+       hpage = gfn_to_hva(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
+       if (kvm_is_error_hva(hpage))
+               return;
+
+       hpage |= pte->raddr & ~PAGE_MASK;
+       hpage &= ~0xFFFULL;
+
+       page = vmalloc(HW_PAGE_SIZE);
+       if (!page)
+               return;
+
+       if (copy_from_user(page, (void __user *)hpage, HW_PAGE_SIZE))
+               goto out;
+
+       for (i = 0; i < HW_PAGE_SIZE / 4; i++)
+               if ((page[i] & 0xff0007ff) == INS_DCBZ) {
+                       page[i] &= 0xfffffff7; /* reserved instruction, so we trap */
+                       touched = true;
+               }
+
+       if (touched)
+               copy_to_user((void __user *)hpage, page, HW_PAGE_SIZE);
+
+out:
+       vfree(page);
+}
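
The patch loop above recognizes dcbz by masking off its register fields: (insn & 0xff0007ff) keeps only the opcode bits, and clearing one low bit (&= 0xfffffff7) turns the instruction into a reserved encoding that traps. A sketch of the match, assuming INS_DCBZ carries dcbz's opcode pattern 0x7c0007ec:

    #include <stdio.h>

    #define INS_DCBZ 0x7c0007ecU    /* assumed value of the kernel constant */

    static int is_dcbz(unsigned int insn)
    {
            return (insn & 0xff0007ffU) == INS_DCBZ;
    }

    int main(void)
    {
            unsigned int insn = 0x7c0517ecU;        /* dcbz r5,r2 (example) */

            if (is_dcbz(insn))
                    printf("patched to %08x\n", insn & 0xfffffff7U);
            return 0;
    }
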
+
+static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data,
+                        struct kvmppc_pte *pte)
+{
+       int relocated = (vcpu->arch.msr & (data ? MSR_DR : MSR_IR));
+       int r;
+
+       if (relocated) {
+               r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data);
+       } else {
+               pte->eaddr = eaddr;
+               pte->raddr = eaddr & 0xffffffff;
+               pte->vpage = eaddr >> 12;
+               switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+               case 0:
+                       pte->vpage |= VSID_REAL;
+               case MSR_DR:
+                       pte->vpage |= VSID_REAL_DR;
+               case MSR_IR:
+                       pte->vpage |= VSID_REAL_IR;
+               }
+               pte->may_read = true;
+               pte->may_write = true;
+               pte->may_execute = true;
+               r = 0;
+       }
+
+       return r;
+}
+
+static hva_t kvmppc_bad_hva(void)
+{
+       return PAGE_OFFSET;
+}
+
+static hva_t kvmppc_pte_to_hva(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte,
+                              bool read)
+{
+       hva_t hpage;
+
+       if (read && !pte->may_read)
+               goto err;
+
+       if (!read && !pte->may_write)
+               goto err;
+
+       hpage = gfn_to_hva(vcpu->kvm, pte->raddr >> PAGE_SHIFT);
+       if (kvm_is_error_hva(hpage))
+               goto err;
+
+       return hpage | (pte->raddr & ~PAGE_MASK);
+err:
+       return kvmppc_bad_hva();
+}
+
+int kvmppc_st(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr)
+{
+       struct kvmppc_pte pte;
+       hva_t hva = eaddr;
+
+       vcpu->stat.st++;
+
+       if (kvmppc_xlate(vcpu, eaddr, false, &pte))
+               goto err;
+
+       hva = kvmppc_pte_to_hva(vcpu, &pte, false);
+       if (kvm_is_error_hva(hva))
+               goto err;
+
+       if (copy_to_user((void __user *)hva, ptr, size)) {
+               printk(KERN_INFO "kvmppc_st at 0x%lx failed\n", hva);
+               goto err;
+       }
+
+       return 0;
+
+err:
+       return -ENOENT;
+}
+
+int kvmppc_ld(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr,
+                     bool data)
+{
+       struct kvmppc_pte pte;
+       hva_t hva = eaddr;
+
+       vcpu->stat.ld++;
+
+       if (kvmppc_xlate(vcpu, eaddr, data, &pte))
+               goto err;
+
+       hva = kvmppc_pte_to_hva(vcpu, &pte, true);
+       if (kvm_is_error_hva(hva))
+               goto err;
+
+       if (copy_from_user(ptr, (void __user *)hva, size)) {
+               printk(KERN_INFO "kvmppc_ld at 0x%lx failed\n", hva);
+               goto err;
+       }
+
+       return 0;
+
+err:
+       return -ENOENT;
+}
+
+static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+       return kvm_is_visible_gfn(vcpu->kvm, gfn);
+}
+
+int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                           ulong eaddr, int vec)
+{
+       bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE);
+       int r = RESUME_GUEST;
+       int relocated;
+       int page_found = 0;
+       struct kvmppc_pte pte;
+       bool is_mmio = false;
+
+       if (vec == BOOK3S_INTERRUPT_DATA_STORAGE) {
+               relocated = (vcpu->arch.msr & MSR_DR);
+       } else {
+               relocated = (vcpu->arch.msr & MSR_IR);
+       }
+
+       /* Resolve real address if translation turned on */
+       if (relocated) {
+               page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data);
+       } else {
+               pte.may_execute = true;
+               pte.may_read = true;
+               pte.may_write = true;
+               pte.raddr = eaddr & 0xffffffff;
+               pte.eaddr = eaddr;
+               pte.vpage = eaddr >> 12;
+               switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+               case 0:
+                       pte.vpage |= VSID_REAL;
+               case MSR_DR:
+                       pte.vpage |= VSID_REAL_DR;
+               case MSR_IR:
+                       pte.vpage |= VSID_REAL_IR;
+               }
+       }
+
+       if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+          (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
+               /*
+                * If we do the dcbz hack, we have to NX on every execution,
+                * so we can patch the executing code. This renders our guest
+                * NX-less.
+                */
+               pte.may_execute = !data;
+       }
+
+       if (page_found == -ENOENT) {
+               /* Page not found in guest PTE entries */
+               vcpu->arch.dear = vcpu->arch.fault_dear;
+               to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr;
+               vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x00000000f8000000ULL);
+               kvmppc_book3s_queue_irqprio(vcpu, vec);
+       } else if (page_found == -EPERM) {
+               /* Storage protection */
+               vcpu->arch.dear = vcpu->arch.fault_dear;
+               to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr & ~DSISR_NOHPTE;
+               to_book3s(vcpu)->dsisr |= DSISR_PROTFAULT;
+               vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x00000000f8000000ULL);
+               kvmppc_book3s_queue_irqprio(vcpu, vec);
+       } else if (page_found == -EINVAL) {
+               /* Page not found in guest SLB */
+               vcpu->arch.dear = vcpu->arch.fault_dear;
+               kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80);
+       } else if (!is_mmio &&
+                  kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) {
+               /* The guest's PTE is not mapped yet. Map on the host */
+               kvmppc_mmu_map_page(vcpu, &pte);
+               if (data)
+                       vcpu->stat.sp_storage++;
+               else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+                       (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32)))
+                       kvmppc_patch_dcbz(vcpu, &pte);
+       } else {
+               /* MMIO */
+               vcpu->stat.mmio_exits++;
+               vcpu->arch.paddr_accessed = pte.raddr;
+               r = kvmppc_emulate_mmio(run, vcpu);
+               if (r == RESUME_HOST_NV)
+                       r = RESUME_HOST;
+               if (r == RESUME_GUEST_NV)
+                       r = RESUME_GUEST;
+       }
+
+       return r;
+}
+
+int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                       unsigned int exit_nr)
+{
+       int r = RESUME_HOST;
+
+       vcpu->stat.sum_exits++;
+
+       run->exit_reason = KVM_EXIT_UNKNOWN;
+       run->ready_for_interrupt_injection = 1;
+#ifdef EXIT_DEBUG
+       printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | dar=0x%lx | dec=0x%x | msr=0x%lx\n",
+               exit_nr, vcpu->arch.pc, vcpu->arch.fault_dear,
+               kvmppc_get_dec(vcpu), vcpu->arch.msr);
+#elif defined (EXIT_DEBUG_SIMPLE)
+       if ((exit_nr != 0x900) && (exit_nr != 0x500))
+               printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | dar=0x%lx | msr=0x%lx\n",
+                       exit_nr, vcpu->arch.pc, vcpu->arch.fault_dear,
+                       vcpu->arch.msr);
+#endif
+       kvm_resched(vcpu);
+       switch (exit_nr) {
+       case BOOK3S_INTERRUPT_INST_STORAGE:
+               vcpu->stat.pf_instruc++;
+               /* only care about PTEG not found errors, but leave NX alone */
+               if (vcpu->arch.shadow_msr & 0x40000000) {
+                       r = kvmppc_handle_pagefault(run, vcpu, vcpu->arch.pc, exit_nr);
+                       vcpu->stat.sp_instruc++;
+               } else if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+                         (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) {
+                       /*
+                        * XXX If we do the dcbz hack we use the NX bit to flush&patch the page,
+                        *     so we can't use the NX bit inside the guest. Let's cross our fingers,
+                        *     that no guest that needs the dcbz hack does NX.
+                        */
+                       kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL);
+               } else {
+                       vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x58000000);
+                       kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+                       kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL);
+                       r = RESUME_GUEST;
+               }
+               break;
+       case BOOK3S_INTERRUPT_DATA_STORAGE:
+               vcpu->stat.pf_storage++;
+               /* The only case we need to handle is missing shadow PTEs */
+               if (vcpu->arch.fault_dsisr & DSISR_NOHPTE) {
+                       r = kvmppc_handle_pagefault(run, vcpu, vcpu->arch.fault_dear, exit_nr);
+               } else {
+                       vcpu->arch.dear = vcpu->arch.fault_dear;
+                       to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr;
+                       kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+                       kvmppc_mmu_pte_flush(vcpu, vcpu->arch.dear, ~0xFFFULL);
+                       r = RESUME_GUEST;
+               }
+               break;
+       case BOOK3S_INTERRUPT_DATA_SEGMENT:
+               if (kvmppc_mmu_map_segment(vcpu, vcpu->arch.fault_dear) < 0) {
+                       vcpu->arch.dear = vcpu->arch.fault_dear;
+                       kvmppc_book3s_queue_irqprio(vcpu,
+                               BOOK3S_INTERRUPT_DATA_SEGMENT);
+               }
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_INST_SEGMENT:
+               if (kvmppc_mmu_map_segment(vcpu, vcpu->arch.pc) < 0) {
+                       kvmppc_book3s_queue_irqprio(vcpu,
+                               BOOK3S_INTERRUPT_INST_SEGMENT);
+               }
+               r = RESUME_GUEST;
+               break;
+       /* We're good on these - the host merely wanted to get our attention */
+       case BOOK3S_INTERRUPT_DECREMENTER:
+               vcpu->stat.dec_exits++;
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_EXTERNAL:
+               vcpu->stat.ext_intr_exits++;
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_PROGRAM:
+       {
+               enum emulation_result er;
+
+               if (vcpu->arch.msr & MSR_PR) {
+#ifdef EXIT_DEBUG
+                       printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", vcpu->arch.pc, vcpu->arch.last_inst);
+#endif
+                       if ((vcpu->arch.last_inst & 0xff0007ff) !=
+                           (INS_DCBZ & 0xfffffff7)) {
+                               kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+                               r = RESUME_GUEST;
+                               break;
+                       }
+               }
+
+               vcpu->stat.emulated_inst_exits++;
+               er = kvmppc_emulate_instruction(run, vcpu);
+               switch (er) {
+               case EMULATE_DONE:
+                       r = RESUME_GUEST;
+                       break;
+               case EMULATE_FAIL:
+                       printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
+                              __func__, vcpu->arch.pc, vcpu->arch.last_inst);
+                       kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+                       r = RESUME_GUEST;
+                       break;
+               default:
+                       BUG();
+               }
+               break;
+       }
+       case BOOK3S_INTERRUPT_SYSCALL:
+#ifdef EXIT_DEBUG
+               printk(KERN_INFO "Syscall Nr %d\n", (int)vcpu->arch.gpr[0]);
+#endif
+               vcpu->stat.syscall_exits++;
+               kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+               r = RESUME_GUEST;
+               break;
+       case BOOK3S_INTERRUPT_MACHINE_CHECK:
+       case BOOK3S_INTERRUPT_FP_UNAVAIL:
+       case BOOK3S_INTERRUPT_TRACE:
+       case BOOK3S_INTERRUPT_ALTIVEC:
+       case BOOK3S_INTERRUPT_VSX:
+               kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+               r = RESUME_GUEST;
+               break;
+       default:
+               /* Ugh - bork here! What did we get? */
+               printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", exit_nr, vcpu->arch.pc, vcpu->arch.shadow_msr);
+               r = RESUME_HOST;
+               BUG();
+               break;
+       }
+
+       if (!(r & RESUME_HOST)) {
+               /* To avoid clobbering exit_reason, only check for signals if
+                * we aren't already exiting to userspace for some other
+                * reason. */
+               if (signal_pending(current)) {
+#ifdef EXIT_DEBUG
+                       printk(KERN_EMERG "KVM: Going back to host\n");
+#endif
+                       vcpu->stat.signal_exits++;
+                       run->exit_reason = KVM_EXIT_INTR;
+                       r = -EINTR;
+               } else {
+                       /* In case an interrupt came in that was triggered
+                        * from userspace (like DEC), we need to check what
+                        * to inject now! */
+                       kvmppc_core_deliver_interrupts(vcpu);
+               }
+       }
+
+#ifdef EXIT_DEBUG
+       printk(KERN_EMERG "KVM exit: vcpu=0x%p pc=0x%lx r=0x%x\n", vcpu, vcpu->arch.pc, r);
+#endif
+
+       return r;
+}
+
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+       int i;
+
+       regs->pc = vcpu->arch.pc;
+       regs->cr = vcpu->arch.cr;
+       regs->ctr = vcpu->arch.ctr;
+       regs->lr = vcpu->arch.lr;
+       regs->xer = vcpu->arch.xer;
+       regs->msr = vcpu->arch.msr;
+       regs->srr0 = vcpu->arch.srr0;
+       regs->srr1 = vcpu->arch.srr1;
+       regs->pid = vcpu->arch.pid;
+       regs->sprg0 = vcpu->arch.sprg0;
+       regs->sprg1 = vcpu->arch.sprg1;
+       regs->sprg2 = vcpu->arch.sprg2;
+       regs->sprg3 = vcpu->arch.sprg3;
+       regs->sprg4 = vcpu->arch.sprg4;
+       regs->sprg5 = vcpu->arch.sprg5;
+       regs->sprg6 = vcpu->arch.sprg6;
+       regs->sprg7 = vcpu->arch.sprg7;
+
+       for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
+               regs->gpr[i] = vcpu->arch.gpr[i];
+
+       return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+       int i;
+
+       vcpu->arch.pc = regs->pc;
+       vcpu->arch.cr = regs->cr;
+       vcpu->arch.ctr = regs->ctr;
+       vcpu->arch.lr = regs->lr;
+       vcpu->arch.xer = regs->xer;
+       kvmppc_set_msr(vcpu, regs->msr);
+       vcpu->arch.srr0 = regs->srr0;
+       vcpu->arch.srr1 = regs->srr1;
+       vcpu->arch.sprg0 = regs->sprg0;
+       vcpu->arch.sprg1 = regs->sprg1;
+       vcpu->arch.sprg2 = regs->sprg2;
+       vcpu->arch.sprg3 = regs->sprg3;
+       vcpu->arch.sprg4 = regs->sprg4;
+       vcpu->arch.sprg5 = regs->sprg5;
+       vcpu->arch.sprg6 = regs->sprg6;
+       vcpu->arch.sprg7 = regs->sprg7;
+
+       for (i = 0; i < ARRAY_SIZE(vcpu->arch.gpr); i++)
+               vcpu->arch.gpr[i] = regs->gpr[i];
+
+       return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+       sregs->pvr = vcpu->arch.pvr;
+       return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+                                  struct kvm_sregs *sregs)
+{
+       kvmppc_set_pvr(vcpu, sregs->pvr);
+       return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+       return -ENOTSUPP;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+       return -ENOTSUPP;
+}
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+                                  struct kvm_translation *tr)
+{
+       return 0;
+}
+
+/*
+ * Get (and clear) the dirty memory log for a memory slot.
+ */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+                                     struct kvm_dirty_log *log)
+{
+       struct kvm_memory_slot *memslot;
+       struct kvm_vcpu *vcpu;
+       ulong ga, ga_end;
+       int is_dirty = 0;
+       int r, n;
+
+       down_write(&kvm->slots_lock);
+
+       r = kvm_get_dirty_log(kvm, log, &is_dirty);
+       if (r)
+               goto out;
+
+       /* If nothing is dirty, don't bother messing with page tables. */
+       if (is_dirty) {
+               memslot = &kvm->memslots[log->slot];
+
+               ga = memslot->base_gfn << PAGE_SHIFT;
+               ga_end = ga + (memslot->npages << PAGE_SHIFT);
+
+               kvm_for_each_vcpu(n, vcpu, kvm)
+                       kvmppc_mmu_pte_pflush(vcpu, ga, ga_end);
+
+               n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+               memset(memslot->dirty_bitmap, 0, n);
+       }
+
+       r = 0;
+out:
+       up_write(&kvm->slots_lock);
+       return r;
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+       return 0;
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+       struct kvmppc_vcpu_book3s *vcpu_book3s;
+       struct kvm_vcpu *vcpu;
+       int err;
+
+       vcpu_book3s = (struct kvmppc_vcpu_book3s *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+                       get_order(sizeof(struct kvmppc_vcpu_book3s)));
+       if (!vcpu_book3s) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       vcpu = &vcpu_book3s->vcpu;
+       err = kvm_vcpu_init(vcpu, kvm, id);
+       if (err)
+               goto free_vcpu;
+
+       vcpu->arch.host_retip = kvm_return_point;
+       vcpu->arch.host_msr = mfmsr();
+       /* default to book3s_64 (970fx) */
+       vcpu->arch.pvr = 0x3C0301;
+       kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
+       vcpu_book3s->slb_nr = 64;
+
+       /* remember where some real-mode handlers are */
+       vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem;
+       vcpu->arch.trampoline_enter = kvmppc_trampoline_enter;
+       vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem;
+
+       vcpu->arch.shadow_msr = MSR_USER64;
+
+       err = __init_new_context();
+       if (err < 0)
+               goto free_vcpu;
+       vcpu_book3s->context_id = err;
+
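+       /* Derive this vcpu's private shadow VSID range from the MMU
+        * context we just allocated, so guest segments never collide
+        * with host translations. */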
+       vcpu_book3s->vsid_max = ((vcpu_book3s->context_id + 1) << USER_ESID_BITS) - 1;
+       vcpu_book3s->vsid_first = vcpu_book3s->context_id << USER_ESID_BITS;
+       vcpu_book3s->vsid_next = vcpu_book3s->vsid_first;
+
+       return vcpu;
+
+free_vcpu:
+       free_pages((long)vcpu_book3s, get_order(sizeof(struct kvmppc_vcpu_book3s)));
+out:
+       return ERR_PTR(err);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+
+       __destroy_context(vcpu_book3s->context_id);
+       kvm_vcpu_uninit(vcpu);
+       free_pages((long)vcpu_book3s, get_order(sizeof(struct kvmppc_vcpu_book3s)));
+}
+
+extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+{
+       int ret;
+
+       /* No need to go into the guest when all we'd do is go right back out */
+       if (signal_pending(current)) {
+               kvm_run->exit_reason = KVM_EXIT_INTR;
+               return -EINTR;
+       }
+
+       /* XXX we get called with irq disabled - change that! */
+       local_irq_enable();
+
+       ret = __kvmppc_vcpu_entry(kvm_run, vcpu);
+
+       local_irq_disable();
+
+       return ret;
+}
+
+static int kvmppc_book3s_init(void)
+{
+       return kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), THIS_MODULE);
+}
+
+static void kvmppc_book3s_exit(void)
+{
+       kvm_exit();
+}
+
+module_init(kvmppc_book3s_init);
+module_exit(kvmppc_book3s_exit);
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
new file mode 100644 (file)
index 0000000..faf99f2
--- /dev/null
@@ -0,0 +1,372 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+
+/* #define DEBUG_MMU */
+/* #define DEBUG_MMU_PTE */
+/* #define DEBUG_MMU_PTE_IP 0xfff14c40 */
+
+#ifdef DEBUG_MMU
+#define dprintk(X...) printk(KERN_INFO X)
+#else
+#define dprintk(X...) do { } while(0)
+#endif
+
+#ifdef DEBUG_MMU_PTE
+#define dprintk_pte(X...) printk(KERN_INFO X)
+#else
+#define dprintk_pte(X...) do { } while(0)
+#endif
+
+#define PTEG_FLAG_ACCESSED     0x00000100
+#define PTEG_FLAG_DIRTY                0x00000080
+
+static inline bool check_debug_ip(struct kvm_vcpu *vcpu)
+{
+#ifdef DEBUG_MMU_PTE_IP
+       return vcpu->arch.pc == DEBUG_MMU_PTE_IP;
+#else
+       return true;
+#endif
+}
+
+static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
+                                         struct kvmppc_pte *pte, bool data);
+
+static struct kvmppc_sr *find_sr(struct kvmppc_vcpu_book3s *vcpu_book3s, gva_t eaddr)
+{
+       return &vcpu_book3s->sr[(eaddr >> 28) & 0xf];
+}
+
+static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
+                                        bool data)
+{
+       struct kvmppc_sr *sre = find_sr(to_book3s(vcpu), eaddr);
+       struct kvmppc_pte pte;
+
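+       /* BAT mappings carry their own virtual page number; for anything
+        * else the VP is the 16-bit page index within the segment, tagged
+        * with the segment's VSID. */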
+       if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data))
+               return pte.vpage;
+
+       return (((u64)eaddr >> 12) & 0xffff) | (((u64)sre->vsid) << 16);
+}
+
+static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu)
+{
+       kvmppc_set_msr(vcpu, 0);
+}
+
+static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3s,
+                                     struct kvmppc_sr *sre, gva_t eaddr,
+                                     bool primary)
+{
+       u32 page, hash, pteg, htabmask;
+       hva_t r;
+
+       page = (eaddr & 0x0FFFFFFF) >> 12;
+       htabmask = ((vcpu_book3s->sdr1 & 0x1FF) << 16) | 0xFFC0;
+
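+       /* hash = VSID ^ page index, inverted for the secondary PTEG;
+        * each PTEG is 64 bytes, hence the << 6, and htabmask folds
+        * SDR1's HTABMASK field into the lookup. */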
+       hash = ((sre->vsid ^ page) << 6);
+       if (!primary)
+               hash = ~hash;
+       hash &= htabmask;
+
+       pteg = (vcpu_book3s->sdr1 & 0xffff0000) | hash;
+
+       dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n",
+               vcpu_book3s->vcpu.arch.pc, eaddr, vcpu_book3s->sdr1, pteg,
+               sre->vsid);
+
+       r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT);
+       if (kvm_is_error_hva(r))
+               return r;
+       return r | (pteg & ~PAGE_MASK);
+}
+
+static u32 kvmppc_mmu_book3s_32_get_ptem(struct kvmppc_sr *sre, gva_t eaddr,
+                                   bool primary)
+{
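+       /* First word of the PTE to match against: valid bit, VSID,
+        * H (secondary hash) and the 6-bit abbreviated page index. */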
+       return ((eaddr & 0x0fffffff) >> 22) | (sre->vsid << 7) |
+              (primary ? 0 : 0x40) | 0x80000000;
+}
+
+static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr,
+                                         struct kvmppc_pte *pte, bool data)
+{
+       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+       struct kvmppc_bat *bat;
+       int i;
+
+       for (i = 0; i < 8; i++) {
+               if (data)
+                       bat = &vcpu_book3s->dbat[i];
+               else
+                       bat = &vcpu_book3s->ibat[i];
+
+               if (vcpu->arch.msr & MSR_PR) {
+                       if (!bat->vp)
+                               continue;
+               } else {
+                       if (!bat->vs)
+                               continue;
+               }
+
+               if (check_debug_ip(vcpu)) {
+                       dprintk_pte("%cBAT %02d: 0x%lx - 0x%x (0x%x)\n",
+                                   data ? 'd' : 'i', i, eaddr, bat->bepi,
+                                   bat->bepi_mask);
+               }
+               if ((eaddr & bat->bepi_mask) == bat->bepi) {
+                       pte->raddr = bat->brpn | (eaddr & ~bat->bepi_mask);
+                       pte->vpage = (eaddr >> 12) | VSID_BAT;
+                       pte->may_read = bat->pp;
+                       pte->may_write = bat->pp > 1;
+                       pte->may_execute = true;
+                       if (!pte->may_read) {
+                               printk(KERN_INFO "BAT is not readable!\n");
+                               continue;
+                       }
+                       if (!pte->may_write) {
+                               /* let's treat r/o BATs as not-readable for now */
+                               dprintk_pte("BAT is read-only!\n");
+                               continue;
+                       }
+
+                       return 0;
+               }
+       }
+
+       return -ENOENT;
+}
+
+static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
+                                    struct kvmppc_pte *pte, bool data,
+                                    bool primary)
+{
+       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+       struct kvmppc_sr *sre;
+       hva_t ptegp;
+       u32 pteg[16];
+       u64 ptem = 0;
+       int i;
+       int found = 0;
+
+       sre = find_sr(vcpu_book3s, eaddr);
+
+       dprintk_pte("SR 0x%lx: vsid=0x%x, raw=0x%x\n", eaddr >> 28,
+                   sre->vsid, sre->raw);
+
+       pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data);
+
+       ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu_book3s, sre, eaddr, primary);
+       if (kvm_is_error_hva(ptegp)) {
+               printk(KERN_INFO "KVM: Invalid PTEG!\n");
+               goto no_page_found;
+       }
+
+       ptem = kvmppc_mmu_book3s_32_get_ptem(sre, eaddr, primary);
+
+       if (copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {
+               printk(KERN_ERR "KVM: Can't copy data from 0x%lx!\n", ptegp);
+               goto no_page_found;
+       }
+
+       for (i = 0; i < 16; i += 2) {
+               if (ptem == pteg[i]) {
+                       u8 pp;
+
+                       pte->raddr = (pteg[i+1] & ~(0xFFFULL)) | (eaddr & 0xFFF);
+                       pp = pteg[i+1] & 3;
+
+                       if ((sre->Kp &&  (vcpu->arch.msr & MSR_PR)) ||
+                           (sre->Ks && !(vcpu->arch.msr & MSR_PR)))
+                               pp |= 4;
+
+                       pte->may_write = false;
+                       pte->may_read = false;
+                       pte->may_execute = true;
+                       switch (pp) {
+                               case 0:
+                               case 1:
+                               case 2:
+                               case 6:
+                                       pte->may_write = true;
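+                                       /* fall through */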
+                               case 3:
+                               case 5:
+                               case 7:
+                                       pte->may_read = true;
+                                       break;
+                       }
+
+                       if (!pte->may_read)
+                               continue;
+
+                       dprintk_pte("MMU: Found PTE -> %x %x - %x\n",
+                                   pteg[i], pteg[i+1], pp);
+                       found = 1;
+                       break;
+               }
+       }
+
+       /* Update PTE C and A bits, so the guest's swapper knows we used the
+        * page */
+       if (found) {
+               u32 oldpte = pteg[i+1];
+
+               if (pte->may_read)
+                       pteg[i+1] |= PTEG_FLAG_ACCESSED;
+               if (pte->may_write)
+                       pteg[i+1] |= PTEG_FLAG_DIRTY;
+               else
+                       dprintk_pte("KVM: Mapping read-only page!\n");
+
+               /* Write back into the PTEG */
+               if (pteg[i+1] != oldpte)
+                       copy_to_user((void __user *)ptegp, pteg, sizeof(pteg));
+
+               return 0;
+       }
+
+no_page_found:
+
+       if (check_debug_ip(vcpu)) {
+               dprintk_pte("KVM MMU: No PTE found (sdr1=0x%llx ptegp=0x%lx)\n",
+                           to_book3s(vcpu)->sdr1, ptegp);
+               for (i = 0; i < 16; i += 2) {
+                       dprintk_pte("   %02d: 0x%x - 0x%x (0x%llx)\n",
+                                   i, pteg[i], pteg[i+1], ptem);
+               }
+       }
+
+       return -ENOENT;
+}
+
+static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+                                     struct kvmppc_pte *pte, bool data)
+{
+       int r;
+
+       pte->eaddr = eaddr;
+       r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data);
+       if (r < 0)
+               r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, true);
+       if (r < 0)
+               r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, false);
+
+       return r;
+}
+
+static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum)
+{
+       return to_book3s(vcpu)->sr[srnum].raw;
+}
+
+static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum,
+                                       ulong value)
+{
+       struct kvmppc_sr *sre;
+
+       sre = &to_book3s(vcpu)->sr[srnum];
+
+       /* Flush any left-over shadows from the previous SR */
+
+       /* XXX Not necessary? */
+       /* kvmppc_mmu_pte_flush(vcpu, ((u64)sre->vsid) << 28, 0xf0000000ULL); */
+
+       /* And then put in the new SR */
+       sre->raw = value;
+       sre->vsid = (value & 0x0fffffff);
+       sre->Ks = (value & 0x40000000) ? true : false;
+       sre->Kp = (value & 0x20000000) ? true : false;
+       sre->nx = (value & 0x10000000) ? true : false;
+
+       /* Map the new segment */
+       kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT);
+}
+
+static void kvmppc_mmu_book3s_32_tlbie(struct kvm_vcpu *vcpu, ulong ea, bool large)
+{
+       kvmppc_mmu_pte_flush(vcpu, ea, ~0xFFFULL);
+}
+
+static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, u64 esid,
+                                            u64 *vsid)
+{
+       /* In case we only have one of MSR_IR or MSR_DR set, let's put
+          that in the real-mode context (and hope RM doesn't access
+          high memory) */
+       switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+       case 0:
+               *vsid = (VSID_REAL >> 16) | esid;
+               break;
+       case MSR_IR:
+               *vsid = (VSID_REAL_IR >> 16) | esid;
+               break;
+       case MSR_DR:
+               *vsid = (VSID_REAL_DR >> 16) | esid;
+               break;
+       case MSR_DR|MSR_IR:
+       {
+               ulong ea;
+               ea = esid << SID_SHIFT;
+               *vsid = find_sr(to_book3s(vcpu), ea)->vsid;
+               break;
+       }
+       default:
+               BUG();
+       }
+
+       return 0;
+}
+
+static bool kvmppc_mmu_book3s_32_is_dcbz32(struct kvm_vcpu *vcpu)
+{
+       return true;
+}
+
+void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
+
+       mmu->mtsrin = kvmppc_mmu_book3s_32_mtsrin;
+       mmu->mfsrin = kvmppc_mmu_book3s_32_mfsrin;
+       mmu->xlate = kvmppc_mmu_book3s_32_xlate;
+       mmu->reset_msr = kvmppc_mmu_book3s_32_reset_msr;
+       mmu->tlbie = kvmppc_mmu_book3s_32_tlbie;
+       mmu->esid_to_vsid = kvmppc_mmu_book3s_32_esid_to_vsid;
+       mmu->ea_to_vp = kvmppc_mmu_book3s_32_ea_to_vp;
+       mmu->is_dcbz32 = kvmppc_mmu_book3s_32_is_dcbz32;
+
+       mmu->slbmte = NULL;
+       mmu->slbmfee = NULL;
+       mmu->slbmfev = NULL;
+       mmu->slbie = NULL;
+       mmu->slbia = NULL;
+}
diff --git a/arch/powerpc/kvm/book3s_64_emulate.c b/arch/powerpc/kvm/book3s_64_emulate.c
new file mode 100644 (file)
index 0000000..c343e67
--- /dev/null
@@ -0,0 +1,337 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/kvm_ppc.h>
+#include <asm/disassemble.h>
+#include <asm/kvm_book3s.h>
+#include <asm/reg.h>
+
+#define OP_19_XOP_RFID         18
+#define OP_19_XOP_RFI          50
+
+#define OP_31_XOP_MFMSR                83
+#define OP_31_XOP_MTMSR                146
+#define OP_31_XOP_MTMSRD       178
+#define OP_31_XOP_MTSRIN       242
+#define OP_31_XOP_TLBIEL       274
+#define OP_31_XOP_TLBIE                306
+#define OP_31_XOP_SLBMTE       402
+#define OP_31_XOP_SLBIE                434
+#define OP_31_XOP_SLBIA                498
+#define OP_31_XOP_MFSRIN       659
+#define OP_31_XOP_SLBMFEV      851
+#define OP_31_XOP_EIOIO                854
+#define OP_31_XOP_SLBMFEE      915
+
+/* DCBZ is actually 1014, but we patch it to 1010 so we get a trap */
+#define OP_31_XOP_DCBZ         1010
+
+int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                           unsigned int inst, int *advance)
+{
+       int emulated = EMULATE_DONE;
+
+       switch (get_op(inst)) {
+       case 19:
+               switch (get_xop(inst)) {
+               case OP_19_XOP_RFID:
+               case OP_19_XOP_RFI:
+                       vcpu->arch.pc = vcpu->arch.srr0;
+                       kvmppc_set_msr(vcpu, vcpu->arch.srr1);
+                       *advance = 0;
+                       break;
+
+               default:
+                       emulated = EMULATE_FAIL;
+                       break;
+               }
+               break;
+       case 31:
+               switch (get_xop(inst)) {
+               case OP_31_XOP_MFMSR:
+                       vcpu->arch.gpr[get_rt(inst)] = vcpu->arch.msr;
+                       break;
+               case OP_31_XOP_MTMSRD:
+               {
+                       ulong rs = vcpu->arch.gpr[get_rs(inst)];
+                       if (inst & 0x10000) {
+                               vcpu->arch.msr &= ~(MSR_RI | MSR_EE);
+                               vcpu->arch.msr |= rs & (MSR_RI | MSR_EE);
+                       } else
+                               kvmppc_set_msr(vcpu, rs);
+                       break;
+               }
+               case OP_31_XOP_MTMSR:
+                       kvmppc_set_msr(vcpu, vcpu->arch.gpr[get_rs(inst)]);
+                       break;
+               case OP_31_XOP_MFSRIN:
+               {
+                       int srnum;
+
+                       srnum = (vcpu->arch.gpr[get_rb(inst)] >> 28) & 0xf;
+                       if (vcpu->arch.mmu.mfsrin) {
+                               u32 sr;
+                               sr = vcpu->arch.mmu.mfsrin(vcpu, srnum);
+                               vcpu->arch.gpr[get_rt(inst)] = sr;
+                       }
+                       break;
+               }
+               case OP_31_XOP_MTSRIN:
+                       vcpu->arch.mmu.mtsrin(vcpu,
+                               (vcpu->arch.gpr[get_rb(inst)] >> 28) & 0xf,
+                               vcpu->arch.gpr[get_rs(inst)]);
+                       break;
+               case OP_31_XOP_TLBIE:
+               case OP_31_XOP_TLBIEL:
+               {
+                       bool large = (inst & 0x00200000) ? true : false;
+                       ulong addr = vcpu->arch.gpr[get_rb(inst)];
+                       vcpu->arch.mmu.tlbie(vcpu, addr, large);
+                       break;
+               }
+               case OP_31_XOP_EIOIO:
+                       break;
+               case OP_31_XOP_SLBMTE:
+                       if (!vcpu->arch.mmu.slbmte)
+                               return EMULATE_FAIL;
+
+                       vcpu->arch.mmu.slbmte(vcpu, vcpu->arch.gpr[get_rs(inst)],
+                                               vcpu->arch.gpr[get_rb(inst)]);
+                       break;
+               case OP_31_XOP_SLBIE:
+                       if (!vcpu->arch.mmu.slbie)
+                               return EMULATE_FAIL;
+
+                       vcpu->arch.mmu.slbie(vcpu, vcpu->arch.gpr[get_rb(inst)]);
+                       break;
+               case OP_31_XOP_SLBIA:
+                       if (!vcpu->arch.mmu.slbia)
+                               return EMULATE_FAIL;
+
+                       vcpu->arch.mmu.slbia(vcpu);
+                       break;
+               case OP_31_XOP_SLBMFEE:
+                       if (!vcpu->arch.mmu.slbmfee) {
+                               emulated = EMULATE_FAIL;
+                       } else {
+                               ulong t, rb;
+
+                               rb = vcpu->arch.gpr[get_rb(inst)];
+                               t = vcpu->arch.mmu.slbmfee(vcpu, rb);
+                               vcpu->arch.gpr[get_rt(inst)] = t;
+                       }
+                       break;
+               case OP_31_XOP_SLBMFEV:
+                       if (!vcpu->arch.mmu.slbmfev) {
+                               emulated = EMULATE_FAIL;
+                       } else {
+                               ulong t, rb;
+
+                               rb = vcpu->arch.gpr[get_rb(inst)];
+                               t = vcpu->arch.mmu.slbmfev(vcpu, rb);
+                               vcpu->arch.gpr[get_rt(inst)] = t;
+                       }
+                       break;
+               case OP_31_XOP_DCBZ:
+               {
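+                       /* The patched dcbz (see OP_31_XOP_DCBZ above) is
+                        * emulated as a 32-byte dcbz: store a line of
+                        * zeros through the guest MMU and fault the guest
+                        * if the store fails. */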
+                       ulong rb = vcpu->arch.gpr[get_rb(inst)];
+                       ulong ra = 0;
+                       ulong addr;
+                       u32 zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+                       if (get_ra(inst))
+                               ra = vcpu->arch.gpr[get_ra(inst)];
+
+                       addr = (ra + rb) & ~31ULL;
+                       if (!(vcpu->arch.msr & MSR_SF))
+                               addr &= 0xffffffff;
+
+                       if (kvmppc_st(vcpu, addr, 32, zeros)) {
+                               vcpu->arch.dear = addr;
+                               vcpu->arch.fault_dear = addr;
+                               to_book3s(vcpu)->dsisr = DSISR_PROTFAULT |
+                                                     DSISR_ISSTORE;
+                               kvmppc_book3s_queue_irqprio(vcpu,
+                                       BOOK3S_INTERRUPT_DATA_STORAGE);
+                               kvmppc_mmu_pte_flush(vcpu, addr, ~0xFFFULL);
+                       }
+
+                       break;
+               }
+               default:
+                       emulated = EMULATE_FAIL;
+               }
+               break;
+       default:
+               emulated = EMULATE_FAIL;
+       }
+
+       return emulated;
+}
+
+static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u64 val)
+{
+       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+       struct kvmppc_bat *bat;
+
+       switch (sprn) {
+       case SPRN_IBAT0U ... SPRN_IBAT3L:
+               bat = &vcpu_book3s->ibat[(sprn - SPRN_IBAT0U) / 2];
+               break;
+       case SPRN_IBAT4U ... SPRN_IBAT7L:
+               bat = &vcpu_book3s->ibat[(sprn - SPRN_IBAT4U) / 2];
+               break;
+       case SPRN_DBAT0U ... SPRN_DBAT3L:
+               bat = &vcpu_book3s->dbat[(sprn - SPRN_DBAT0U) / 2];
+               break;
+       case SPRN_DBAT4U ... SPRN_DBAT7L:
+               bat = &vcpu_book3s->dbat[(sprn - SPRN_DBAT4U) / 2];
+               break;
+       default:
+               BUG();
+       }
+
+       if (!(sprn % 2)) {
+               /* Upper BAT */
+               u32 bl = (val >> 2) & 0x7ff;
+               bat->bepi_mask = (~bl << 17);
+               bat->bepi = val & 0xfffe0000;
+               bat->vs = (val & 2) ? 1 : 0;
+               bat->vp = (val & 1) ? 1 : 0;
+       } else {
+               /* Lower BAT */
+               bat->brpn = val & 0xfffe0000;
+               bat->wimg = (val >> 3) & 0xf;
+               bat->pp = val & 3;
+       }
+}
+
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+       int emulated = EMULATE_DONE;
+
+       switch (sprn) {
+       case SPRN_SDR1:
+               to_book3s(vcpu)->sdr1 = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_DSISR:
+               to_book3s(vcpu)->dsisr = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_DAR:
+               vcpu->arch.dear = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_HIOR:
+               to_book3s(vcpu)->hior = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_IBAT0U ... SPRN_IBAT3L:
+       case SPRN_IBAT4U ... SPRN_IBAT7L:
+       case SPRN_DBAT0U ... SPRN_DBAT3L:
+       case SPRN_DBAT4U ... SPRN_DBAT7L:
+               kvmppc_write_bat(vcpu, sprn, vcpu->arch.gpr[rs]);
+               /* BAT writes happen so rarely that we're ok to flush
+                * everything here */
+               kvmppc_mmu_pte_flush(vcpu, 0, 0);
+               break;
+       case SPRN_HID0:
+               to_book3s(vcpu)->hid[0] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_HID1:
+               to_book3s(vcpu)->hid[1] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_HID2:
+               to_book3s(vcpu)->hid[2] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_HID4:
+               to_book3s(vcpu)->hid[4] = vcpu->arch.gpr[rs];
+               break;
+       case SPRN_HID5:
+               to_book3s(vcpu)->hid[5] = vcpu->arch.gpr[rs];
+               /* a guest write to HID5 can change is_dcbz32 */
+               if (vcpu->arch.mmu.is_dcbz32(vcpu) &&
+                   (mfmsr() & MSR_HV))
+                       vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
+               break;
+       case SPRN_ICTC:
+       case SPRN_THRM1:
+       case SPRN_THRM2:
+       case SPRN_THRM3:
+       case SPRN_CTRLF:
+       case SPRN_CTRLT:
+               break;
+       default:
+               printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn);
+#ifndef DEBUG_SPR
+               emulated = EMULATE_FAIL;
+#endif
+               break;
+       }
+
+       return emulated;
+}
+
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+       int emulated = EMULATE_DONE;
+
+       switch (sprn) {
+       case SPRN_SDR1:
+               vcpu->arch.gpr[rt] = to_book3s(vcpu)->sdr1;
+               break;
+       case SPRN_DSISR:
+               vcpu->arch.gpr[rt] = to_book3s(vcpu)->dsisr;
+               break;
+       case SPRN_DAR:
+               vcpu->arch.gpr[rt] = vcpu->arch.dear;
+               break;
+       case SPRN_HIOR:
+               vcpu->arch.gpr[rt] = to_book3s(vcpu)->hior;
+               break;
+       case SPRN_HID0:
+               vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[0];
+               break;
+       case SPRN_HID1:
+               vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[1];
+               break;
+       case SPRN_HID2:
+               vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[2];
+               break;
+       case SPRN_HID4:
+               vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[4];
+               break;
+       case SPRN_HID5:
+               vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[5];
+               break;
+       case SPRN_THRM1:
+       case SPRN_THRM2:
+       case SPRN_THRM3:
+       case SPRN_CTRLF:
+       case SPRN_CTRLT:
+               vcpu->arch.gpr[rt] = 0;
+               break;
+       default:
+               printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn);
+#ifndef DEBUG_SPR
+               emulated = EMULATE_FAIL;
+#endif
+               break;
+       }
+
+       return emulated;
+}
diff --git a/arch/powerpc/kvm/book3s_64_exports.c b/arch/powerpc/kvm/book3s_64_exports.c
new file mode 100644 (file)
index 0000000..5b2db38
--- /dev/null
@@ -0,0 +1,24 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <linux/module.h>
+#include <asm/kvm_book3s.h>
+
+EXPORT_SYMBOL_GPL(kvmppc_trampoline_enter);
+EXPORT_SYMBOL_GPL(kvmppc_trampoline_lowmem);
diff --git a/arch/powerpc/kvm/book3s_64_interrupts.S b/arch/powerpc/kvm/book3s_64_interrupts.S
new file mode 100644 (file)
index 0000000..7b55d80
--- /dev/null
@@ -0,0 +1,392 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+
+#define KVMPPC_HANDLE_EXIT .kvmppc_handle_exit
+#define ULONG_SIZE 8
+#define VCPU_GPR(n)     (VCPU_GPRS + (n * ULONG_SIZE))
+
+.macro mfpaca tmp_reg, src_reg, offset, vcpu_reg
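+       /* Move a guest GPR that the exit path stashed in the PACA
+        * exception save area over into the vcpu struct. */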
+       ld      \tmp_reg, (PACA_EXMC+\offset)(r13)
+       std     \tmp_reg, VCPU_GPR(\src_reg)(\vcpu_reg)
+.endm
+
+.macro DISABLE_INTERRUPTS
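+       /* Clear MSR_EE: rotate it into the MSB, mask it off with rldicl,
+        * rotate back into place and write the result with mtmsrd. */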
+       mfmsr   r0
+       rldicl  r0,r0,48,1
+       rotldi  r0,r0,16
+       mtmsrd  r0,1
+.endm
+
+/*****************************************************************************
+ *                                                                           *
+ *     Guest entry / exit code that is in kernel module memory (highmem)     *
+ *                                                                           *
+ ****************************************************************************/
+
+/* Registers:
+ *  r3: kvm_run pointer
+ *  r4: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcpu_entry)
+
+kvm_start_entry:
+       /* Write correct stack frame */
+       mflr    r0
+       std     r0,16(r1)
+
+       /* Save host state to the stack */
+       stdu    r1, -SWITCH_FRAME_SIZE(r1)
+
+       /* Save r3 (kvm_run) and r4 (vcpu) */
+       SAVE_2GPRS(3, r1)
+
+       /* Save non-volatile registers (r14 - r31) */
+       SAVE_NVGPRS(r1)
+
+       /* Save LR */
+       mflr    r14
+       std     r14, _LINK(r1)
+
+/* XXX optimize non-volatile loading away */
+kvm_start_lightweight:
+
+       DISABLE_INTERRUPTS
+
+       /* Save R1/R2 in the PACA */
+       std     r1, PACAR1(r13)
+       std     r2, (PACA_EXMC+EX_SRR0)(r13)
+       ld      r3, VCPU_HIGHMEM_HANDLER(r4)
+       std     r3, PACASAVEDMSR(r13)
+
+       /* Load non-volatile guest state from the vcpu */
+       ld      r14, VCPU_GPR(r14)(r4)
+       ld      r15, VCPU_GPR(r15)(r4)
+       ld      r16, VCPU_GPR(r16)(r4)
+       ld      r17, VCPU_GPR(r17)(r4)
+       ld      r18, VCPU_GPR(r18)(r4)
+       ld      r19, VCPU_GPR(r19)(r4)
+       ld      r20, VCPU_GPR(r20)(r4)
+       ld      r21, VCPU_GPR(r21)(r4)
+       ld      r22, VCPU_GPR(r22)(r4)
+       ld      r23, VCPU_GPR(r23)(r4)
+       ld      r24, VCPU_GPR(r24)(r4)
+       ld      r25, VCPU_GPR(r25)(r4)
+       ld      r26, VCPU_GPR(r26)(r4)
+       ld      r27, VCPU_GPR(r27)(r4)
+       ld      r28, VCPU_GPR(r28)(r4)
+       ld      r29, VCPU_GPR(r29)(r4)
+       ld      r30, VCPU_GPR(r30)(r4)
+       ld      r31, VCPU_GPR(r31)(r4)
+
+       ld      r9, VCPU_PC(r4)                 /* r9 = vcpu->arch.pc */
+       ld      r10, VCPU_SHADOW_MSR(r4)        /* r10 = vcpu->arch.shadow_msr */
+
+       ld      r3, VCPU_TRAMPOLINE_ENTER(r4)
+       mtsrr0  r3
+
+       LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR))
+       mtsrr1  r3
+
+       /* Load guest state in the respective registers */
+       lwz     r3, VCPU_CR(r4)         /* r3 = vcpu->arch.cr */
+       stw     r3, (PACA_EXMC + EX_CCR)(r13)
+
+       ld      r3, VCPU_CTR(r4)        /* r3 = vcpu->arch.ctr */
+       mtctr   r3                      /* CTR = r3 */
+
+       ld      r3, VCPU_LR(r4)         /* r3 = vcpu->arch.lr */
+       mtlr    r3                      /* LR = r3 */
+
+       ld      r3, VCPU_XER(r4)        /* r3 = vcpu->arch.xer */
+       std     r3, (PACA_EXMC + EX_R3)(r13)
+
+       /* Some guests may need to have dcbz set to 32 byte length.
+        *
+        * Usually we ensure that by patching the guest's instructions
+        * to trap on dcbz and emulate it in the hypervisor.
+        *
+        * If we can, we should tell the CPU to use 32 byte dcbz though,
+        * because that's a lot faster.
+        */
+
+       ld      r3, VCPU_HFLAGS(r4)
+       rldicl. r3, r3, 0, 63           /* CR = ((r3 & 1) == 0) */
+       beq     no_dcbz32_on
+
+       mfspr   r3,SPRN_HID5
+       ori     r3, r3, 0x80            /* XXX HID5_dcbz32 = 0x80 */
+       mtspr   SPRN_HID5,r3
+
+no_dcbz32_on:
+       /*      Load guest GPRs */
+
+       ld      r3, VCPU_GPR(r9)(r4)
+       std     r3, (PACA_EXMC + EX_R9)(r13)
+       ld      r3, VCPU_GPR(r10)(r4)
+       std     r3, (PACA_EXMC + EX_R10)(r13)
+       ld      r3, VCPU_GPR(r11)(r4)
+       std     r3, (PACA_EXMC + EX_R11)(r13)
+       ld      r3, VCPU_GPR(r12)(r4)
+       std     r3, (PACA_EXMC + EX_R12)(r13)
+       ld      r3, VCPU_GPR(r13)(r4)
+       std     r3, (PACA_EXMC + EX_R13)(r13)
+
+       ld      r0, VCPU_GPR(r0)(r4)
+       ld      r1, VCPU_GPR(r1)(r4)
+       ld      r2, VCPU_GPR(r2)(r4)
+       ld      r3, VCPU_GPR(r3)(r4)
+       ld      r5, VCPU_GPR(r5)(r4)
+       ld      r6, VCPU_GPR(r6)(r4)
+       ld      r7, VCPU_GPR(r7)(r4)
+       ld      r8, VCPU_GPR(r8)(r4)
+       ld      r4, VCPU_GPR(r4)(r4)
+
+       /* This sets the Magic value for the trampoline */
+
+       li      r11, 1
+       stb     r11, PACA_KVM_IN_GUEST(r13)
+
+       /* Jump to the SLB patching handler and into our guest */
+       RFI
+
+/*
+ * This is the handler in module memory. It is jumped to from the
+ * lowmem trampoline code, so it's basically the guest exit code.
+ */
+
+.global kvmppc_handler_highmem
+kvmppc_handler_highmem:
+
+       /*
+        * Register usage at this point:
+        *
+        * R00   = guest R13
+        * R01   = host R1
+        * R02   = host R2
+        * R10   = guest PC
+        * R11   = guest MSR
+        * R12   = exit handler id
+        * R13   = PACA
+        * PACA.exmc.R9    = guest R1
+        * PACA.exmc.R10   = guest R10
+        * PACA.exmc.R11   = guest R11
+        * PACA.exmc.R12   = guest R12
+        * PACA.exmc.R13   = guest R2
+        * PACA.exmc.DAR   = guest DAR
+        * PACA.exmc.DSISR = guest DSISR
+        * PACA.exmc.LR    = guest instruction
+        * PACA.exmc.CCR   = guest CR
+        * PACA.exmc.SRR0  = guest R0
+        *
+        */
+
+       std     r3, (PACA_EXMC+EX_R3)(r13)
+
+       /* save the exit id in R3 */
+       mr      r3, r12
+
+       /* R12 = vcpu */
+       ld      r12, GPR4(r1)
+
+       /* Now save the guest state */
+
+       std     r0, VCPU_GPR(r13)(r12)
+       std     r4, VCPU_GPR(r4)(r12)
+       std     r5, VCPU_GPR(r5)(r12)
+       std     r6, VCPU_GPR(r6)(r12)
+       std     r7, VCPU_GPR(r7)(r12)
+       std     r8, VCPU_GPR(r8)(r12)
+       std     r9, VCPU_GPR(r9)(r12)
+
+       /* get registers from PACA */
+       mfpaca  r5, r0, EX_SRR0, r12
+       mfpaca  r5, r3, EX_R3, r12
+       mfpaca  r5, r1, EX_R9, r12
+       mfpaca  r5, r10, EX_R10, r12
+       mfpaca  r5, r11, EX_R11, r12
+       mfpaca  r5, r12, EX_R12, r12
+       mfpaca  r5, r2, EX_R13, r12
+
+       lwz     r5, (PACA_EXMC+EX_LR)(r13)
+       stw     r5, VCPU_LAST_INST(r12)
+
+       lwz     r5, (PACA_EXMC+EX_CCR)(r13)
+       stw     r5, VCPU_CR(r12)
+
+       ld      r5, VCPU_HFLAGS(r12)
+       rldicl. r5, r5, 0, 63           /* CR = ((r5 & 1) == 0) */
+       beq     no_dcbz32_off
+
+       mfspr   r5,SPRN_HID5
+       rldimi  r5,r5,6,56
+       mtspr   SPRN_HID5,r5
+
+no_dcbz32_off:
+
+       /* XXX maybe skip on lightweight? */
+       std     r14, VCPU_GPR(r14)(r12)
+       std     r15, VCPU_GPR(r15)(r12)
+       std     r16, VCPU_GPR(r16)(r12)
+       std     r17, VCPU_GPR(r17)(r12)
+       std     r18, VCPU_GPR(r18)(r12)
+       std     r19, VCPU_GPR(r19)(r12)
+       std     r20, VCPU_GPR(r20)(r12)
+       std     r21, VCPU_GPR(r21)(r12)
+       std     r22, VCPU_GPR(r22)(r12)
+       std     r23, VCPU_GPR(r23)(r12)
+       std     r24, VCPU_GPR(r24)(r12)
+       std     r25, VCPU_GPR(r25)(r12)
+       std     r26, VCPU_GPR(r26)(r12)
+       std     r27, VCPU_GPR(r27)(r12)
+       std     r28, VCPU_GPR(r28)(r12)
+       std     r29, VCPU_GPR(r29)(r12)
+       std     r30, VCPU_GPR(r30)(r12)
+       std     r31, VCPU_GPR(r31)(r12)
+
+       /* Restore non-volatile host registers (r14 - r31) */
+       REST_NVGPRS(r1)
+
+       /* Save guest PC (R10) */
+       std     r10, VCPU_PC(r12)
+
+       /* Save guest msr (R11) */
+       std     r11, VCPU_SHADOW_MSR(r12)
+
+       /* Save guest CTR */
+       mfctr   r5
+       std     r5, VCPU_CTR(r12)
+
+       /* Save guest LR */
+       mflr    r5
+       std     r5, VCPU_LR(r12)
+
+       /* Save guest XER */
+       mfxer   r5
+       std     r5, VCPU_XER(r12)
+
+       /* Save guest DAR */
+       ld      r5, (PACA_EXMC+EX_DAR)(r13)
+       std     r5, VCPU_FAULT_DEAR(r12)
+
+       /* Save guest DSISR */
+       lwz     r5, (PACA_EXMC+EX_DSISR)(r13)
+       std     r5, VCPU_FAULT_DSISR(r12)
+
+       /* Restore host msr -> SRR1 */
+       ld      r7, VCPU_HOST_MSR(r12)
+       mtsrr1  r7
+
+       /* Restore host IP -> SRR0 */
+       ld      r6, VCPU_HOST_RETIP(r12)
+       mtsrr0  r6
+
+       /*
+        * For some interrupts, we need to call the real Linux
+        * handler, so it can do work for us. This has to happen
+        * as if the interrupt arrived from the kernel though,
+        * so let's fake it here where most state is restored.
+        *
+        * Call Linux for hardware interrupts/decrementer
+        * r3 = address of interrupt handler (exit reason)
+        */
+
+       cmpwi   r3, BOOK3S_INTERRUPT_EXTERNAL
+       beq     call_linux_handler
+       cmpwi   r3, BOOK3S_INTERRUPT_DECREMENTER
+       beq     call_linux_handler
+
+       /* Back to Interruptible Mode! (goto kvm_return_point) */
+       RFI
+
+call_linux_handler:
+
+       /*
+        * If we land here we need to jump back to the handler we
+        * came from.
+        *
+        * We have a page that we can access from real mode, so let's
+        * jump back to that and use it as a trampoline to get back into the
+        * interrupt handler!
+        *
+        * R3 still contains the exit code,
+        * R6 VCPU_HOST_RETIP and
+        * R7 VCPU_HOST_MSR
+        */
+
+       mtlr    r3
+
+       ld      r5, VCPU_TRAMPOLINE_LOWMEM(r12)
+       mtsrr0  r5
+       LOAD_REG_IMMEDIATE(r5, MSR_KERNEL & ~(MSR_IR | MSR_DR))
+       mtsrr1  r5
+
+       RFI
+
+.global kvm_return_point
+kvm_return_point:
+
+       /* Jump back to lightweight entry if we're supposed to */
+       /* go back into the guest */
+       mr      r5, r3
+       /* Restore r3 (kvm_run) and r4 (vcpu) */
+       REST_2GPRS(3, r1)
+       bl      KVMPPC_HANDLE_EXIT
+
+#if 0 /* XXX get lightweight exits back */
+       cmpwi   r3, RESUME_GUEST
+       bne     kvm_exit_heavyweight
+
+       /* put VCPU and KVM_RUN back into place and roll again! */
+       REST_2GPRS(3, r1)
+       b       kvm_start_lightweight
+
+kvm_exit_heavyweight:
+       /* Restore non-volatile host registers */
+       ld      r14, _LINK(r1)
+       mtlr    r14
+       REST_NVGPRS(r1)
+
+       addi    r1, r1, SWITCH_FRAME_SIZE
+#else
+       ld      r4, _LINK(r1)
+       mtlr    r4
+
+       cmpwi   r3, RESUME_GUEST
+       bne     kvm_exit_heavyweight
+
+       REST_2GPRS(3, r1)
+
+       addi    r1, r1, SWITCH_FRAME_SIZE
+
+       b       kvm_start_entry
+
+kvm_exit_heavyweight:
+
+       addi    r1, r1, SWITCH_FRAME_SIZE
+#endif
+
+       blr
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
new file mode 100644 (file)
index 0000000..a31f9c6
--- /dev/null
@@ -0,0 +1,476 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+
+/* #define DEBUG_MMU */
+
+#ifdef DEBUG_MMU
+#define dprintk(X...) printk(KERN_INFO X)
+#else
+#define dprintk(X...) do { } while(0)
+#endif
+
+static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
+{
+       kvmppc_set_msr(vcpu, MSR_SF);
+}
+
+static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
+                               struct kvmppc_vcpu_book3s *vcpu_book3s,
+                               gva_t eaddr)
+{
+       int i;
+       u64 esid = GET_ESID(eaddr);
+       u64 esid_1t = GET_ESID_1T(eaddr);
+
+       for (i = 0; i < vcpu_book3s->slb_nr; i++) {
+               u64 cmp_esid = esid;
+
+               if (!vcpu_book3s->slb[i].valid)
+                       continue;
+
+               if (vcpu_book3s->slb[i].large)
+                       cmp_esid = esid_1t;
+
+               if (vcpu_book3s->slb[i].esid == cmp_esid)
+                       return &vcpu_book3s->slb[i];
+       }
+
+       dprintk("KVM: No SLB entry found for 0x%lx [%llx | %llx]\n",
+               eaddr, esid, esid_1t);
+       for (i = 0; i < vcpu_book3s->slb_nr; i++) {
+           if (vcpu_book3s->slb[i].vsid)
+               dprintk("  %d: %c%c %llx %llx\n", i,
+                       vcpu_book3s->slb[i].valid ? 'v' : ' ',
+                       vcpu_book3s->slb[i].large ? 'l' : ' ',
+                       vcpu_book3s->slb[i].esid,
+                       vcpu_book3s->slb[i].vsid);
+       }
+
+       return NULL;
+}
+
+static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr,
+                                        bool data)
+{
+       struct kvmppc_slb *slb;
+
+       slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), eaddr);
+       if (!slb)
+               return 0;
+
+       if (slb->large)
+               return (((u64)eaddr >> 12) & 0xfffffff) |
+                      (((u64)slb->vsid) << 28);
+
+       return (((u64)eaddr >> 12) & 0xffff) | (((u64)slb->vsid) << 16);
+}
+
+static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe)
+{
+       return slbe->large ? 24 : 12;
+}
+
+static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr)
+{
+       int p = kvmppc_mmu_book3s_64_get_pagesize(slbe);
+       return ((eaddr & 0xfffffff) >> p);
+}
+
+static hva_t kvmppc_mmu_book3s_64_get_pteg(
+                               struct kvmppc_vcpu_book3s *vcpu_book3s,
+                               struct kvmppc_slb *slbe, gva_t eaddr,
+                               bool second)
+{
+       u64 hash, pteg, htabsize;
+       u32 page;
+       hva_t r;
+
+       page = kvmppc_mmu_book3s_64_get_page(slbe, eaddr);
+       htabsize = ((1ULL << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1);
+
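+       /* hash = VSID ^ page index, inverted for the secondary PTEG,
+        * truncated to 39 bits and masked by the HTAB size from SDR1;
+        * each PTEG is 128 bytes, hence the final << 7. */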
+       hash = slbe->vsid ^ page;
+       if (second)
+               hash = ~hash;
+       hash &= ((1ULL << 39ULL) - 1ULL);
+       hash &= htabsize;
+       hash <<= 7ULL;
+
+       pteg = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL;
+       pteg |= hash;
+
+       dprintk("MMU: page=0x%x sdr1=0x%llx pteg=0x%llx vsid=0x%llx\n",
+               page, vcpu_book3s->sdr1, pteg, slbe->vsid);
+
+       r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT);
+       if (kvm_is_error_hva(r))
+               return r;
+       return r | (pteg & ~PAGE_MASK);
+}
+
+static u64 kvmppc_mmu_book3s_64_get_avpn(struct kvmppc_slb *slbe, gva_t eaddr)
+{
+       int p = kvmppc_mmu_book3s_64_get_pagesize(slbe);
+       u64 avpn;
+
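+       /* Abbreviated virtual page number, shifted so it lines up with
+        * the AVPN field in the first doubleword of an HPTE. */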
+       avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr);
+       avpn |= slbe->vsid << (28 - p);
+
+       if (p < 24)
+               avpn >>= ((80 - p) - 56) - 8;
+       else
+               avpn <<= 8;
+
+       return avpn;
+}
+
+static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
+                               struct kvmppc_pte *gpte, bool data)
+{
+       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+       struct kvmppc_slb *slbe;
+       hva_t ptegp;
+       u64 pteg[16];
+       u64 avpn = 0;
+       int i;
+       u8 key = 0;
+       bool found = false;
+       bool perm_err = false;
+       int second = 0;
+
+       slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, eaddr);
+       if (!slbe)
+               goto no_seg_found;
+
+do_second:
+       ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second);
+       if (kvm_is_error_hva(ptegp))
+               goto no_page_found;
+
+       avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr);
+
+       if (copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {
+               printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp);
+               goto no_page_found;
+       }
+
+       if ((vcpu->arch.msr & MSR_PR) && slbe->Kp)
+               key = 4;
+       else if (!(vcpu->arch.msr & MSR_PR) && slbe->Ks)
+               key = 4;
+
+       for (i = 0; i < 16; i += 2) {
+               u64 v = pteg[i];
+               u64 r = pteg[i+1];
+
+               /* Valid check */
+               if (!(v & HPTE_V_VALID))
+                       continue;
+               /* Hash check */
+               if ((v & HPTE_V_SECONDARY) != second)
+                       continue;
+
+               /* AVPN compare */
+               if (HPTE_V_AVPN_VAL(avpn) == HPTE_V_AVPN_VAL(v)) {
+                       u8 pp = (r & HPTE_R_PP) | key;
+                       int eaddr_mask = 0xFFF;
+
+                       gpte->eaddr = eaddr;
+                       gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu,
+                                                                   eaddr,
+                                                                   data);
+                       if (slbe->large)
+                               eaddr_mask = 0xFFFFFF;
+                       gpte->raddr = (r & HPTE_R_RPN) | (eaddr & eaddr_mask);
+                       gpte->may_execute = ((r & HPTE_R_N) ? false : true);
+                       gpte->may_read = false;
+                       gpte->may_write = false;
+
+                       switch (pp) {
+                       case 0:
+                       case 1:
+                       case 2:
+                       case 6:
+                               gpte->may_write = true;
+                               /* fall through */
+                       case 3:
+                       case 5:
+                       case 7:
+                               gpte->may_read = true;
+                               break;
+                       }
+
+                       if (!gpte->may_read) {
+                               perm_err = true;
+                               continue;
+                       }
+
+                       dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx "
+                               "-> 0x%llx\n",
+                               eaddr, avpn, gpte->vpage, gpte->raddr);
+                       found = true;
+                       break;
+               }
+       }
+
+       /* Update PTE R and C bits, so the guest's swapper knows we used the
+        * page */
+       if (found) {
+               u32 oldr = pteg[i+1];
+
+               if (gpte->may_read) {
+                       /* Set the accessed flag */
+                       pteg[i+1] |= HPTE_R_R;
+               }
+               if (gpte->may_write) {
+                       /* Set the dirty flag */
+                       pteg[i+1] |= HPTE_R_C;
+               } else {
+                       dprintk("KVM: Mapping read-only page!\n");
+               }
+
+               /* Write back into the PTEG */
+               if (pteg[i+1] != oldr)
+                       copy_to_user((void __user *)ptegp, pteg, sizeof(pteg));
+
+               return 0;
+       } else {
+               dprintk("KVM MMU: No PTE found (ea=0x%lx sdr1=0x%llx "
+                       "ptegp=0x%lx)\n",
+                       eaddr, to_book3s(vcpu)->sdr1, ptegp);
+               for (i = 0; i < 16; i += 2)
+                       dprintk("   %02d: 0x%llx - 0x%llx (0x%llx)\n",
+                               i, pteg[i], pteg[i+1], avpn);
+
+               if (!second) {
+                       second = HPTE_V_SECONDARY;
+                       goto do_second;
+               }
+       }
+
+no_page_found:
+
+       if (perm_err)
+               return -EPERM;
+
+       return -ENOENT;
+
+no_seg_found:
+
+       dprintk("KVM MMU: Trigger segment fault\n");
+       return -EINVAL;
+}
+
+static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb)
+{
+       struct kvmppc_vcpu_book3s *vcpu_book3s;
+       u64 esid, esid_1t;
+       int slb_nr;
+       struct kvmppc_slb *slbe;
+
+       dprintk("KVM MMU: slbmte(0x%llx, 0x%llx)\n", rs, rb);
+
+       vcpu_book3s = to_book3s(vcpu);
+
+       esid = GET_ESID(rb);
+       esid_1t = GET_ESID_1T(rb);
+       slb_nr = rb & 0xfff;
+
+       if (slb_nr > vcpu_book3s->slb_nr)
+               return;
+
+       slbe = &vcpu_book3s->slb[slb_nr];
+
+       slbe->large = (rs & SLB_VSID_L) ? 1 : 0;
+       slbe->esid  = slbe->large ? esid_1t : esid;
+       slbe->vsid  = rs >> 12;
+       slbe->valid = (rb & SLB_ESID_V) ? 1 : 0;
+       slbe->Ks    = (rs & SLB_VSID_KS) ? 1 : 0;
+       slbe->Kp    = (rs & SLB_VSID_KP) ? 1 : 0;
+       slbe->nx    = (rs & SLB_VSID_N) ? 1 : 0;
+       slbe->class = (rs & SLB_VSID_C) ? 1 : 0;
+
+       slbe->orige = rb & (ESID_MASK | SLB_ESID_V);
+       slbe->origv = rs;
+
+       /* Map the new segment */
+       kvmppc_mmu_map_segment(vcpu, esid << SID_SHIFT);
+}
+
+static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr)
+{
+       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+       struct kvmppc_slb *slbe;
+
+       if (slb_nr >= vcpu_book3s->slb_nr)
+               return 0;
+
+       slbe = &vcpu_book3s->slb[slb_nr];
+
+       return slbe->orige;
+}
+
+static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr)
+{
+       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+       struct kvmppc_slb *slbe;
+
+       if (slb_nr >= vcpu_book3s->slb_nr)
+               return 0;
+
+       slbe = &vcpu_book3s->slb[slb_nr];
+
+       return slbe->origv;
+}
+
+static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea)
+{
+       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+       struct kvmppc_slb *slbe;
+
+       dprintk("KVM MMU: slbie(0x%llx)\n", ea);
+
+       slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, ea);
+
+       if (!slbe)
+               return;
+
+       dprintk("KVM MMU: slbie(0x%llx, 0x%llx)\n", ea, slbe->esid);
+
+       slbe->valid = false;
+
+       kvmppc_mmu_map_segment(vcpu, ea);
+}
+
+static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+       int i;
+
+       dprintk("KVM MMU: slbia()\n");
+
+       for (i = 1; i < vcpu_book3s->slb_nr; i++)
+               vcpu_book3s->slb[i].valid = false;
+
+       if (vcpu->arch.msr & MSR_IR) {
+               kvmppc_mmu_flush_segments(vcpu);
+               kvmppc_mmu_map_segment(vcpu, vcpu->arch.pc);
+       }
+}
+
+static void kvmppc_mmu_book3s_64_mtsrin(struct kvm_vcpu *vcpu, u32 srnum,
+                                       ulong value)
+{
+       u64 rb = 0, rs = 0;
+
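+       /* Emulate mtsrin on a 64-bit guest by synthesizing the
+        * equivalent 256MB SLB entry and feeding it through slbmte. */
+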
+       /* ESID = srnum */
+       rb |= (srnum & 0xf) << 28;
+       /* Set the valid bit */
+       rb |= 1 << 27;
+       /* Index = ESID */
+       rb |= srnum;
+
+       /* VSID: the low 28 bits of the SR value */
+       rs |= (value & 0xfffffff) << 12;
+       /* Protection flags (Ks/Kp/N) from the top of the SR value */
+       rs |= ((value >> 27) & 0xf) << 9;
+
+       kvmppc_mmu_book3s_64_slbmte(vcpu, rs, rb);
+}
+
+static void kvmppc_mmu_book3s_64_tlbie(struct kvm_vcpu *vcpu, ulong va,
+                                      bool large)
+{
+       u64 mask = 0xFFFFFFFFFULL;
+
+       dprintk("KVM MMU: tlbie(0x%lx)\n", va);
+
+       if (large)
+               mask = 0xFFFFFF000ULL;
+       kvmppc_mmu_pte_vflush(vcpu, va >> 12, mask);
+}
+
+static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, u64 esid,
+                                            u64 *vsid)
+{
+       switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) {
+       case 0:
+               *vsid = (VSID_REAL >> 16) | esid;
+               break;
+       case MSR_IR:
+               *vsid = (VSID_REAL_IR >> 16) | esid;
+               break;
+       case MSR_DR:
+               *vsid = (VSID_REAL_DR >> 16) | esid;
+               break;
+       case MSR_DR|MSR_IR:
+       {
+               ulong ea;
+               struct kvmppc_slb *slb;
+               ea = esid << SID_SHIFT;
+               slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea);
+               if (slb)
+                       *vsid = slb->vsid;
+               else
+                       return -ENOENT;
+
+               break;
+       }
+       default:
+               BUG();
+               break;
+       }
+
+       return 0;
+}
+
+static bool kvmppc_mmu_book3s_64_is_dcbz32(struct kvm_vcpu *vcpu)
+{
+       return (to_book3s(vcpu)->hid[5] & 0x80);
+}
+
+void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu)
+{
+       struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
+
+       mmu->mfsrin = NULL;
+       mmu->mtsrin = kvmppc_mmu_book3s_64_mtsrin;
+       mmu->slbmte = kvmppc_mmu_book3s_64_slbmte;
+       mmu->slbmfee = kvmppc_mmu_book3s_64_slbmfee;
+       mmu->slbmfev = kvmppc_mmu_book3s_64_slbmfev;
+       mmu->slbie = kvmppc_mmu_book3s_64_slbie;
+       mmu->slbia = kvmppc_mmu_book3s_64_slbia;
+       mmu->xlate = kvmppc_mmu_book3s_64_xlate;
+       mmu->reset_msr = kvmppc_mmu_book3s_64_reset_msr;
+       mmu->tlbie = kvmppc_mmu_book3s_64_tlbie;
+       mmu->esid_to_vsid = kvmppc_mmu_book3s_64_esid_to_vsid;
+       mmu->ea_to_vp = kvmppc_mmu_book3s_64_ea_to_vp;
+       mmu->is_dcbz32 = kvmppc_mmu_book3s_64_is_dcbz32;
+}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
new file mode 100644 (file)
index 0000000..f2899b2
--- /dev/null
@@ -0,0 +1,408 @@
+/*
+ * Copyright (C) 2009 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ *     Kevin Wolf <mail@kevin-wolf.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/machdep.h>
+#include <asm/mmu_context.h>
+#include <asm/hw_irq.h>
+
+#define PTE_SIZE 12
+#define VSID_ALL 0
+
+/* #define DEBUG_MMU */
+/* #define DEBUG_SLB */
+
+#ifdef DEBUG_MMU
+#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
+#else
+#define dprintk_mmu(a, ...) do { } while (0)
+#endif
+
+#ifdef DEBUG_SLB
+#define dprintk_slb(a, ...) printk(KERN_INFO a, __VA_ARGS__)
+#else
+#define dprintk_slb(a, ...) do { } while (0)
+#endif
+
+static void invalidate_pte(struct hpte_cache *pte)
+{
+       dprintk_mmu("KVM: Flushing SPT %d: 0x%llx (0x%llx) -> 0x%llx\n",
+                   i, pte->pte.eaddr, pte->pte.vpage, pte->host_va);
+
+       ppc_md.hpte_invalidate(pte->slot, pte->host_va,
+                              MMU_PAGE_4K, MMU_SEGSIZE_256M,
+                              false);
+       pte->host_va = 0;
+       kvm_release_pfn_dirty(pte->pfn);
+}
+
+void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, u64 guest_ea, u64 ea_mask)
+{
+       int i;
+
+       dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%llx & 0x%llx\n",
+                   vcpu->arch.hpte_cache_offset, guest_ea, ea_mask);
+       BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
+
+       guest_ea &= ea_mask;
+       for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
+               struct hpte_cache *pte;
+
+               pte = &vcpu->arch.hpte_cache[i];
+               if (!pte->host_va)
+                       continue;
+
+               if ((pte->pte.eaddr & ea_mask) == guest_ea) {
+                       invalidate_pte(pte);
+               }
+       }
+
+       /* Doing a complete flush -> start from scratch */
+       if (!ea_mask)
+               vcpu->arch.hpte_cache_offset = 0;
+}
+
+void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
+{
+       int i;
+
+       dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
+                   vcpu->arch.hpte_cache_offset, guest_vp, vp_mask);
+       BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
+
+       guest_vp &= vp_mask;
+       for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
+               struct hpte_cache *pte;
+
+               pte = &vcpu->arch.hpte_cache[i];
+               if (!pte->host_va)
+                       continue;
+
+               if ((pte->pte.vpage & vp_mask) == guest_vp) {
+                       invalidate_pte(pte);
+               }
+       }
+}
+
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, u64 pa_start, u64 pa_end)
+{
+       int i;
+
+       dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%llx & 0x%llx\n",
+                   vcpu->arch.hpte_cache_offset, guest_pa, pa_mask);
+       BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
+
+       for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
+               struct hpte_cache *pte;
+
+               pte = &vcpu->arch.hpte_cache[i];
+               if (!pte->host_va)
+                       continue;
+
+               if ((pte->pte.raddr >= pa_start) &&
+                   (pte->pte.raddr < pa_end)) {
+                       invalidate_pte(pte);
+               }
+       }
+}
+
+struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data)
+{
+       int i;
+       u64 guest_vp;
+
+       guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false);
+       for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
+               struct hpte_cache *pte;
+
+               pte = &vcpu->arch.hpte_cache[i];
+               if (!pte->host_va)
+                       continue;
+
+               if (pte->pte.vpage == guest_vp)
+                       return &pte->pte;
+       }
+
+       return NULL;
+}
+
+static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
+{
+       if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM)
+               kvmppc_mmu_pte_flush(vcpu, 0, 0);
+
+       return vcpu->arch.hpte_cache_offset++;
+}
+
+/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
+ * a hash, so we don't waste cycles on looping */
+static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+       return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
+                    ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
+                    ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
+                    ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
+                    ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
+                    ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
+                    ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
+                    ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
+}
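
Written as a loop, the hash XOR-folds every SID_MAP_BITS-wide slice of the 64-bit gvsid into one table index. An equivalent editorial form, assuming SID_MAP_BITS is 9 to match the 512-entry comment (the real constant lives in the KVM headers):

#include <stdint.h>
#include <stdio.h>

#define SID_MAP_BITS 9                        /* assumed: 512 entries */
#define SID_MAP_MASK ((1U << SID_MAP_BITS) - 1)

static uint16_t sid_hash(uint64_t gvsid)
{
	uint16_t h = 0;
	int i;

	/* XOR-fold all 9-bit slices of the gvsid into one index */
	for (i = 0; i < 8; i++)
		h ^= (gvsid >> (SID_MAP_BITS * i)) & SID_MAP_MASK;
	return h;
}

int main(void)
{
	printf("hash(0x123456789) = %u\n", sid_hash(0x123456789ULL));
	return 0;
}

find_sid_vsid() below then probes both sid_map[hash] and the mirror slot sid_map[SID_MAP_MASK - hash], which is why create_sid_map() alternates between the two on insertion.
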
+
+
+static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+       struct kvmppc_sid_map *map;
+       u16 sid_map_mask;
+
+       if (vcpu->arch.msr & MSR_PR)
+               gvsid |= VSID_PR;
+
+       sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
+       map = &to_book3s(vcpu)->sid_map[sid_map_mask];
+       if (map->guest_vsid == gvsid) {
+               dprintk_slb("SLB: Searching 0x%llx -> 0x%llx\n",
+                           gvsid, map->host_vsid);
+               return map;
+       }
+
+       map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK - sid_map_mask];
+       if (map->guest_vsid == gvsid) {
+               dprintk_slb("SLB: Searching 0x%llx -> 0x%llx\n",
+                           gvsid, map->host_vsid);
+               return map;
+       }
+
+       dprintk_slb("SLB: Searching 0x%llx -> not found\n", gvsid);
+       return NULL;
+}
+
+int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
+{
+       pfn_t hpaddr;
+       ulong hash, hpteg, va;
+       u64 vsid;
+       int ret;
+       int rflags = 0x192;
+       int vflags = 0;
+       int attempt = 0;
+       struct kvmppc_sid_map *map;
+
+       /* Get host physical address for gpa */
+       hpaddr = gfn_to_pfn(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);
+       if (kvm_is_error_hva(hpaddr)) {
+               printk(KERN_INFO "Couldn't get guest page for gfn %llx!\n", orig_pte->eaddr);
+               return -EINVAL;
+       }
+       hpaddr <<= PAGE_SHIFT;
+#if PAGE_SHIFT == 12
+#elif PAGE_SHIFT == 16
+       hpaddr |= orig_pte->raddr & 0xf000;
+#else
+#error Unknown page size
+#endif
+
+       /* and write the mapping ea -> hpa into the pt */
+       vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid);
+       map = find_sid_vsid(vcpu, vsid);
+       if (!map) {
+               kvmppc_mmu_map_segment(vcpu, orig_pte->eaddr);
+               map = find_sid_vsid(vcpu, vsid);
+       }
+       BUG_ON(!map);
+
+       vsid = map->host_vsid;
+       va = hpt_va(orig_pte->eaddr, vsid, MMU_SEGSIZE_256M);
+
+       if (!orig_pte->may_write)
+               rflags |= HPTE_R_PP;
+       else
+               mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT);
+
+       if (!orig_pte->may_execute)
+               rflags |= HPTE_R_N;
+
+       hash = hpt_hash(va, PTE_SIZE, MMU_SEGSIZE_256M);
+
+map_again:
+       hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP);
+
+       /* In case we tried normal mapping already, let's nuke old entries */
+       if (attempt > 1)
+               if (ppc_md.hpte_remove(hpteg) < 0)
+                       return -1;
+
+       ret = ppc_md.hpte_insert(hpteg, va, hpaddr, rflags, vflags,
+                                MMU_PAGE_4K, MMU_SEGSIZE_256M);
+
+       if (ret < 0) {
+               /* If we couldn't map a primary PTE, try a secondary */
+#ifdef USE_SECONDARY
+               hash = ~hash;
+               attempt++;
+               if (attempt % 2)
+                       vflags = HPTE_V_SECONDARY;
+               else
+                       vflags = 0;
+#else
+               attempt = 2;
+#endif
+               goto map_again;
+       } else {
+               int hpte_id = kvmppc_mmu_hpte_cache_next(vcpu);
+               struct hpte_cache *pte = &vcpu->arch.hpte_cache[hpte_id];
+
+               dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%lx (0x%llx) -> %lx\n",
+                           ((rflags & HPTE_R_PP) == 3) ? '-' : 'w',
+                           (rflags & HPTE_R_N) ? '-' : 'x',
+                           orig_pte->eaddr, hpteg, va, orig_pte->vpage, hpaddr);
+
+               pte->slot = hpteg + (ret & 7);
+               pte->host_va = va;
+               pte->pte = *orig_pte;
+               pte->pfn = hpaddr >> PAGE_SHIFT;
+       }
+
+       return 0;
+}
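
The rflags computation merits a worked example: 0x192 is the base R-field, read-only pages get the pp bits, non-executable pages the N bit. A sketch with invented permissions (the HPTE_R_PP and HPTE_R_N values are stated here as assumptions about the 64-bit hash PTE format):

#include <stdbool.h>
#include <stdio.h>

#define HPTE_R_PP 0x3UL   /* assumed: read-only when both pp bits set */
#define HPTE_R_N  0x4UL   /* assumed: no-execute */

int main(void)
{
	bool may_write = false, may_execute = true;  /* invented guest perms */
	unsigned long rflags = 0x192;                /* base flags, as above */

	if (!may_write)
		rflags |= HPTE_R_PP;
	if (!may_execute)
		rflags |= HPTE_R_N;

	printf("rflags = 0x%lx\n", rflags);          /* 0x193 for this case */
	return 0;
}
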
+
+static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
+{
+       struct kvmppc_sid_map *map;
+       struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
+       u16 sid_map_mask;
+       static int backwards_map = 0;
+
+       if (vcpu->arch.msr & MSR_PR)
+               gvsid |= VSID_PR;
+
+       /* Colliding gvsids would otherwise always fight over the same slot,
+          so let's alternate which of the two candidate slots we map them to */
+
+       sid_map_mask = kvmppc_sid_hash(vcpu, gvsid);
+       if (backwards_map)
+               sid_map_mask = SID_MAP_MASK - sid_map_mask;
+
+       map = &to_book3s(vcpu)->sid_map[sid_map_mask];
+
+       /* Make sure we're taking the other map next time */
+       backwards_map = !backwards_map;
+
+       /* Uh-oh ... out of mappings. Let's flush! */
+       if (vcpu_book3s->vsid_next == vcpu_book3s->vsid_max) {
+               vcpu_book3s->vsid_next = vcpu_book3s->vsid_first;
+               memset(vcpu_book3s->sid_map, 0,
+                      sizeof(struct kvmppc_sid_map) * SID_MAP_NUM);
+               kvmppc_mmu_pte_flush(vcpu, 0, 0);
+               kvmppc_mmu_flush_segments(vcpu);
+       }
+       map->host_vsid = vcpu_book3s->vsid_next++;
+
+       map->guest_vsid = gvsid;
+       map->valid = true;
+
+       return map;
+}
+
+static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid)
+{
+       int i;
+       int max_slb_size = 64;
+       int found_inval = -1;
+       int r;
+
+       if (!get_paca()->kvm_slb_max)
+               get_paca()->kvm_slb_max = 1;
+
+       /* Are we overwriting? */
+       for (i = 1; i < get_paca()->kvm_slb_max; i++) {
+               if (!(get_paca()->kvm_slb[i].esid & SLB_ESID_V))
+                       found_inval = i;
+               else if ((get_paca()->kvm_slb[i].esid & ESID_MASK) == esid)
+                       return i;
+       }
+
+       /* Found a spare entry that was invalidated before */
+       if (found_inval > 0)
+               return found_inval;
+
+       /* No spare invalid entry, so create one */
+
+       if (mmu_slb_size < 64)
+               max_slb_size = mmu_slb_size;
+
+       /* Overflowing -> purge */
+       if ((get_paca()->kvm_slb_max) == max_slb_size)
+               kvmppc_mmu_flush_segments(vcpu);
+
+       r = get_paca()->kvm_slb_max;
+       get_paca()->kvm_slb_max++;
+
+       return r;
+}
+
+int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr)
+{
+       u64 esid = eaddr >> SID_SHIFT;
+       u64 slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V;
+       u64 slb_vsid = SLB_VSID_USER;
+       u64 gvsid;
+       int slb_index;
+       struct kvmppc_sid_map *map;
+
+       slb_index = kvmppc_mmu_next_segment(vcpu, eaddr & ESID_MASK);
+
+       if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) {
+               /* Invalidate an entry */
+               get_paca()->kvm_slb[slb_index].esid = 0;
+               return -ENOENT;
+       }
+
+       map = find_sid_vsid(vcpu, gvsid);
+       if (!map)
+               map = create_sid_map(vcpu, gvsid);
+
+       map->guest_esid = esid;
+
+       slb_vsid |= (map->host_vsid << 12);
+       slb_vsid &= ~SLB_VSID_KP;
+       slb_esid |= slb_index;
+
+       get_paca()->kvm_slb[slb_index].esid = slb_esid;
+       get_paca()->kvm_slb[slb_index].vsid = slb_vsid;
+
+       dprintk_slb("slbmte %#llx, %#llx\n", slb_vsid, slb_esid);
+
+       return 0;
+}
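
The shadow SLB entry is just two packed words handed to slbmte. A standalone sketch of the packing (the SLB_* constants are stand-ins with the values commonly used for 256M segments; the inputs are invented):

#include <stdint.h>
#include <stdio.h>

#define SLB_ESID_V    (1ULL << 27)            /* assumed valid bit */
#define ESID_MASK     0xFFFFFFFFF0000000ULL   /* top 36 bits, 256M segments */
#define SLB_VSID_USER 0x480ULL                /* assumed user protection flags */

int main(void)
{
	uint64_t eaddr = 0x0000000010002000ULL;   /* invented guest EA */
	uint64_t host_vsid = 0x12345;             /* from the sid map */
	int slb_index = 2;

	uint64_t slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V | slb_index;
	uint64_t slb_vsid = SLB_VSID_USER | (host_vsid << 12);

	printf("slbmte %#llx, %#llx\n",
	       (unsigned long long)slb_vsid, (unsigned long long)slb_esid);
	return 0;
}
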
+
+void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
+{
+       get_paca()->kvm_slb_max = 1;
+       get_paca()->kvm_slb[0].esid = 0;
+}
+
+void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
+{
+       kvmppc_mmu_pte_flush(vcpu, 0, 0);
+}
diff --git a/arch/powerpc/kvm/book3s_64_rmhandlers.S b/arch/powerpc/kvm/book3s_64_rmhandlers.S
new file mode 100644 (file)
index 0000000..fb7dd2e
--- /dev/null
@@ -0,0 +1,131 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/page.h>
+#include <asm/asm-offsets.h>
+#include <asm/exception-64s.h>
+
+/*****************************************************************************
+ *                                                                           *
+ *        Real Mode handlers that need to be in low physical memory          *
+ *                                                                           *
+ ****************************************************************************/
+
+
+.macro INTERRUPT_TRAMPOLINE intno
+
+.global kvmppc_trampoline_\intno
+kvmppc_trampoline_\intno:
+
+       mtspr   SPRN_SPRG_SCRATCH0, r13         /* Save r13 */
+
+       /*
+        * First thing to do is to find out if we're coming
+        * from a KVM guest or a Linux process.
+        *
+        * To distinguish, we check a magic byte in the PACA
+        */
+       mfspr   r13, SPRN_SPRG_PACA             /* r13 = PACA */
+       std     r12, (PACA_EXMC + EX_R12)(r13)
+       mfcr    r12
+       stw     r12, (PACA_EXMC + EX_CCR)(r13)
+       lbz     r12, PACA_KVM_IN_GUEST(r13)
+       cmpwi   r12, 0
+       bne     ..kvmppc_handler_hasmagic_\intno
+       /* No KVM guest? Then jump back to the Linux handler! */
+       lwz     r12, (PACA_EXMC + EX_CCR)(r13)
+       mtcr    r12
+       ld      r12, (PACA_EXMC + EX_R12)(r13)
+       mfspr   r13, SPRN_SPRG_SCRATCH0         /* r13 = original r13 */
+       b       kvmppc_resume_\intno            /* Get back original handler */
+
+       /* Now we know we're handling a KVM guest */
+..kvmppc_handler_hasmagic_\intno:
+       /* Unset guest state */
+       li      r12, 0
+       stb     r12, PACA_KVM_IN_GUEST(r13)
+
+       std     r1, (PACA_EXMC+EX_R9)(r13)
+       std     r10, (PACA_EXMC+EX_R10)(r13)
+       std     r11, (PACA_EXMC+EX_R11)(r13)
+       std     r2, (PACA_EXMC+EX_R13)(r13)
+
+       mfsrr0  r10
+       mfsrr1  r11
+
+       /* Restore R1/R2 so we can handle faults */
+       ld      r1, PACAR1(r13)
+       ld      r2, (PACA_EXMC+EX_SRR0)(r13)
+
+       /* Let's store which interrupt we're handling */
+       li      r12, \intno
+
+       /* Jump into the SLB exit code that goes to the highmem handler */
+       b       kvmppc_handler_trampoline_exit
+
+.endm
+
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_SYSTEM_RESET
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_MACHINE_CHECK
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_DATA_STORAGE
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_DATA_SEGMENT
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_INST_STORAGE
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_INST_SEGMENT
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_EXTERNAL
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_ALIGNMENT
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_PROGRAM
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_FP_UNAVAIL
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_DECREMENTER
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_SYSCALL
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_TRACE
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_PERFMON
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_ALTIVEC
+INTERRUPT_TRAMPOLINE   BOOK3S_INTERRUPT_VSX
+
+/*
+ * This trampoline brings us back to a real mode handler
+ *
+ * Input Registers:
+ *
+ * R6 = SRR0
+ * R7 = SRR1
+ * LR = real-mode IP
+ *
+ */
+.global kvmppc_handler_lowmem_trampoline
+kvmppc_handler_lowmem_trampoline:
+
+       mtsrr0  r6
+       mtsrr1  r7
+       blr
+kvmppc_handler_lowmem_trampoline_end:
+
+.global kvmppc_trampoline_lowmem
+kvmppc_trampoline_lowmem:
+       .long kvmppc_handler_lowmem_trampoline - _stext
+
+.global kvmppc_trampoline_enter
+kvmppc_trampoline_enter:
+       .long kvmppc_handler_trampoline_enter - _stext
+
+#include "book3s_64_slb.S"
+
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
new file mode 100644 (file)
index 0000000..ecd237a
--- /dev/null
@@ -0,0 +1,262 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#define SHADOW_SLB_ESID(num)   (SLBSHADOW_SAVEAREA + (num * 0x10))
+#define SHADOW_SLB_VSID(num)   (SLBSHADOW_SAVEAREA + (num * 0x10) + 0x8)
+#define UNBOLT_SLB_ENTRY(num) \
+       ld      r9, SHADOW_SLB_ESID(num)(r12); \
+       /* Invalid? Skip. */; \
+       rldicl. r0, r9, 37, 63; \
+       beq     slb_entry_skip_ ## num; \
+       xoris   r9, r9, SLB_ESID_V@h; \
+       std     r9, SHADOW_SLB_ESID(num)(r12); \
+  slb_entry_skip_ ## num:
+
+#define REBOLT_SLB_ENTRY(num) \
+       ld      r10, SHADOW_SLB_ESID(num)(r11); \
+       cmpdi   r10, 0; \
+       beq     slb_exit_skip_ ## num; \
+       oris    r10, r10, SLB_ESID_V@h; \
+       ld      r9, SHADOW_SLB_VSID(num)(r11); \
+       slbmte  r9, r10; \
+       std     r10, SHADOW_SLB_ESID(num)(r11); \
+slb_exit_skip_ ## num:
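
Both macros only toggle the valid bit in the shadow ESID word: unbolting clears it so the following slbia will not reassert the entry, rebolting sets it and slbmtes the entry back. A minimal sketch (the bit position is an assumption matching the usual SLB_ESID_V):

#include <stdint.h>
#include <stdio.h>

#define SLB_ESID_V (1ULL << 27)   /* assumed valid-bit position */

int main(void)
{
	uint64_t esid = 0xC000000000000800ULL | SLB_ESID_V;  /* invented entry */

	esid &= ~SLB_ESID_V;          /* UNBOLT: xoris r9,r9,SLB_ESID_V@h */
	printf("unbolted: 0x%016llx\n", (unsigned long long)esid);

	esid |= SLB_ESID_V;           /* REBOLT: oris r10,r10,SLB_ESID_V@h */
	printf("rebolted: 0x%016llx\n", (unsigned long long)esid);
	return 0;
}

Note that the xoris form only clears the bit when it is known to be set, which the preceding rldicl. test guarantees.
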
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+.global kvmppc_handler_trampoline_enter
+kvmppc_handler_trampoline_enter:
+
+       /* Required state:
+        *
+        * MSR = ~IR|DR
+        * R13 = PACA
+        * R9 = guest IP
+        * R10 = guest MSR
+        * R11 = free
+        * R12 = free
+        * PACA[PACA_EXMC + EX_R9] = guest R9
+        * PACA[PACA_EXMC + EX_R10] = guest R10
+        * PACA[PACA_EXMC + EX_R11] = guest R11
+        * PACA[PACA_EXMC + EX_R12] = guest R12
+        * PACA[PACA_EXMC + EX_R13] = guest R13
+        * PACA[PACA_EXMC + EX_CCR] = guest CR
+        * PACA[PACA_EXMC + EX_R3] = guest XER
+        */
+
+       mtsrr0  r9
+       mtsrr1  r10
+
+       mtspr   SPRN_SPRG_SCRATCH0, r0
+
+       /* Remove LPAR shadow entries */
+
+#if SLB_NUM_BOLTED == 3
+
+       ld      r12, PACA_SLBSHADOWPTR(r13)
+
+       /* Save off the first entry so we can slbie it later */
+       ld      r10, SHADOW_SLB_ESID(0)(r12)
+       ld      r11, SHADOW_SLB_VSID(0)(r12)
+
+       /* Remove bolted entries */
+       UNBOLT_SLB_ENTRY(0)
+       UNBOLT_SLB_ENTRY(1)
+       UNBOLT_SLB_ENTRY(2)
+
+#else
+#error unknown number of bolted entries
+#endif
+
+       /* Flush SLB */
+
+       slbia
+
+       /* r0 = esid & ESID_MASK */
+       rldicr  r10, r10, 0, 35
+       /* r0 |= CLASS_BIT(VSID) */
+       rldic   r12, r11, 56 - 36, 36
+       or      r10, r10, r12
+       slbie   r10
+
+       isync
+
+       /* Fill SLB with our shadow */
+
+       lbz     r12, PACA_KVM_SLB_MAX(r13)
+       mulli   r12, r12, 16
+       addi    r12, r12, PACA_KVM_SLB
+       add     r12, r12, r13
+
+       /* for (r11 = kvm_slb; r11 < kvm_slb + kvm_slb_max * 16; r11 += 16) */
+       li      r11, PACA_KVM_SLB
+       add     r11, r11, r13
+
+slb_loop_enter:
+
+       ld      r10, 0(r11)
+
+       rldicl. r0, r10, 37, 63
+       beq     slb_loop_enter_skip
+
+       ld      r9, 8(r11)
+       slbmte  r9, r10
+
+slb_loop_enter_skip:
+       addi    r11, r11, 16
+       cmpd    cr0, r11, r12
+       blt     slb_loop_enter
+
+slb_do_enter:
+
+       /* Enter guest */
+
+       mfspr   r0, SPRN_SPRG_SCRATCH0
+
+       ld      r9, (PACA_EXMC+EX_R9)(r13)
+       ld      r10, (PACA_EXMC+EX_R10)(r13)
+       ld      r12, (PACA_EXMC+EX_R12)(r13)
+
+       lwz     r11, (PACA_EXMC+EX_CCR)(r13)
+       mtcr    r11
+
+       ld      r11, (PACA_EXMC+EX_R3)(r13)
+       mtxer   r11
+
+       ld      r11, (PACA_EXMC+EX_R11)(r13)
+       ld      r13, (PACA_EXMC+EX_R13)(r13)
+
+       RFI
+kvmppc_handler_trampoline_enter_end:
+
+
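
The rldicr/rldic pair in the entry code builds the slbie operand from the saved entry-0 words: the ESID is masked to its top 36 bits and the VSID's class bit is rotated into place. An editorial C restatement (sample words invented; the masks follow the rotate-and-mask operands above):

#include <stdint.h>
#include <stdio.h>

static uint64_t rotl64(uint64_t x, unsigned s)
{
	return (x << s) | (x >> (64 - s));
}

int main(void)
{
	uint64_t esid = 0xC000000008000000ULL;   /* invented SLB ESID word */
	uint64_t vsid = 0x0000123456789480ULL;   /* invented SLB VSID word */

	/* rldicr r10,r10,0,35: keep the top 36 bits (esid & ESID_MASK) */
	uint64_t rb = esid & 0xFFFFFFFFF0000000ULL;

	/* rldic r12,r11,20,36: rotate the class bit into the slbie operand */
	rb |= rotl64(vsid, 20) & 0x000000000FF00000ULL;

	printf("slbie operand = 0x%016llx\n", (unsigned long long)rb);
	return 0;
}
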
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+.global kvmppc_handler_trampoline_exit
+kvmppc_handler_trampoline_exit:
+
+       /* Register usage at this point:
+        *
+        * SPRG_SCRATCH0 = guest R13
+        * R01           = host R1
+        * R02           = host R2
+        * R10           = guest PC
+        * R11           = guest MSR
+        * R12           = exit handler id
+        * R13           = PACA
+        * PACA.exmc.CCR = guest CR
+        * PACA.exmc.R9  = guest R1
+        * PACA.exmc.R10 = guest R10
+        * PACA.exmc.R11 = guest R11
+        * PACA.exmc.R12 = guest R12
+        * PACA.exmc.R13 = guest R2
+        *
+        */
+
+       /* Save registers */
+
+       std     r0, (PACA_EXMC+EX_SRR0)(r13)
+       std     r9, (PACA_EXMC+EX_R3)(r13)
+       std     r10, (PACA_EXMC+EX_LR)(r13)
+       std     r11, (PACA_EXMC+EX_DAR)(r13)
+
+       /*
+        * In order to easily read the last instruction, the one
+        * we took the #vmexit at, we exploit the fact that the
+        * virtual layout is still the same here, so we can just
+        * ld from the guest's PC address.
+        */
+
+       /* We only load the last instruction when it's safe */
+       cmpwi   r12, BOOK3S_INTERRUPT_DATA_STORAGE
+       beq     ld_last_inst
+       cmpwi   r12, BOOK3S_INTERRUPT_PROGRAM
+       beq     ld_last_inst
+
+       b       no_ld_last_inst
+
+ld_last_inst:
+       /* Save off the guest instruction we're at */
+       /*    1) enable paging for data */
+       mfmsr   r9
+       ori     r11, r9, MSR_DR                 /* Enable paging for data */
+       mtmsr   r11
+       /*    2) fetch the instruction */
+       lwz     r0, 0(r10)
+       /*    3) disable paging again */
+       mtmsr   r9
+
+no_ld_last_inst:
+
+       /* Restore bolted entries from the shadow and fix it along the way */
+
+       /* We don't store anything in entry 0, so we don't need to take care of it */
+       slbia
+       isync
+
+#if SLB_NUM_BOLTED == 3
+
+       ld      r11, PACA_SLBSHADOWPTR(r13)
+
+       REBOLT_SLB_ENTRY(0)
+       REBOLT_SLB_ENTRY(1)
+       REBOLT_SLB_ENTRY(2)
+
+#else
+#error unknown number of bolted entries
+#endif
+
+slb_do_exit:
+
+       /* Restore registers */
+
+       ld      r11, (PACA_EXMC+EX_DAR)(r13)
+       ld      r10, (PACA_EXMC+EX_LR)(r13)
+       ld      r9, (PACA_EXMC+EX_R3)(r13)
+
+       /* Save last inst */
+       stw     r0, (PACA_EXMC+EX_LR)(r13)
+
+       /* Save DAR and DSISR before going to paged mode */
+       mfdar   r0
+       std     r0, (PACA_EXMC+EX_DAR)(r13)
+       mfdsisr r0
+       stw     r0, (PACA_EXMC+EX_DSISR)(r13)
+
+       /* RFI into the highmem handler */
+       mfmsr   r0
+       ori     r0, r0, MSR_IR|MSR_DR|MSR_RI    /* Enable paging */
+       mtsrr1  r0
+       ld      r0, PACASAVEDMSR(r13)           /* Highmem handler address */
+       mtsrr0  r0
+
+       mfspr   r0, SPRN_SPRG_SCRATCH0
+
+       RFI
+kvmppc_handler_trampoline_exit_end:
+
index e7bf4d029484df334b2c1a60df6a1559b1932769..06f5a9ecc42c9fc950666b44a177b7eadeb668ed 100644 (file)
@@ -520,6 +520,11 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
        return kvmppc_core_vcpu_translate(vcpu, tr);
 }
 
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+{
+       return -ENOTSUPP;
+}
+
 int __init kvmppc_booke_init(void)
 {
        unsigned long ivor[16];
index 7737146af3fb4c181a188c479786f8349a02c9ca..4a9ac6640fadb93182d9ade73310f1af1e0c4b93 100644 (file)
@@ -18,7 +18,7 @@
  */
 
 #include <linux/jiffies.h>
-#include <linux/timer.h>
+#include <linux/hrtimer.h>
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/kvm_host.h>
@@ -32,6 +32,7 @@
 #include "trace.h"
 
 #define OP_TRAP 3
+#define OP_TRAP_64 2
 
 #define OP_31_XOP_LWZX      23
 #define OP_31_XOP_LBZX      87
 #define OP_STH  44
 #define OP_STHU 45
 
+#ifdef CONFIG_PPC64
+static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu)
+{
+       return 1;
+}
+#else
+static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu)
+{
+       return vcpu->arch.tcr & TCR_DIE;
+}
+#endif
+
 void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
 {
-       if (vcpu->arch.tcr & TCR_DIE) {
+       unsigned long dec_nsec;
+
+       pr_debug("mtDEC: %x\n", vcpu->arch.dec);
+#ifdef CONFIG_PPC64
+       /* POWER4+ triggers a dec interrupt if the value is < 0 */
+       if (vcpu->arch.dec & 0x80000000) {
+               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+               kvmppc_core_queue_dec(vcpu);
+               return;
+       }
+#endif
+       if (kvmppc_dec_enabled(vcpu)) {
                /* The decrementer ticks at the same rate as the timebase, so
                 * that's how we convert the guest DEC value to the number of
                 * host ticks. */
-               unsigned long nr_jiffies;
 
-               nr_jiffies = vcpu->arch.dec / tb_ticks_per_jiffy;
-               mod_timer(&vcpu->arch.dec_timer,
-                         get_jiffies_64() + nr_jiffies);
+               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+               dec_nsec = vcpu->arch.dec;
+               dec_nsec *= 1000;
+               dec_nsec /= tb_ticks_per_usec;
+               hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
+                             HRTIMER_MODE_REL);
+               vcpu->arch.dec_jiffies = get_tb();
        } else {
-               del_timer(&vcpu->arch.dec_timer);
+               hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
        }
 }
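
The DEC register ticks at the timebase frequency, so the hrtimer deadline is just ticks * 1000 / tb_ticks_per_usec. A worked example assuming a 512 MHz timebase (the real rate comes from the platform):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t dec = 0x10000000;           /* invented guest DEC value */
	uint64_t tb_ticks_per_usec = 512;    /* assumed 512 MHz timebase */

	uint64_t dec_nsec = (uint64_t)dec * 1000 / tb_ticks_per_usec;
	printf("DEC %u ticks -> %llu ns (~%llu ms)\n", dec,
	       (unsigned long long)dec_nsec,
	       (unsigned long long)(dec_nsec / 1000000));
	return 0;
}
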
 
@@ -111,9 +138,15 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
        /* this default type might be overwritten by subcategories */
        kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS);
 
+       pr_debug(KERN_INFO "Emulating opcode %d / %d\n", get_op(inst), get_xop(inst));
+
        switch (get_op(inst)) {
        case OP_TRAP:
+#ifdef CONFIG_PPC64
+       case OP_TRAP_64:
+#else
                vcpu->arch.esr |= ESR_PTR;
+#endif
                kvmppc_core_queue_program(vcpu);
                advance = 0;
                break;
@@ -188,17 +221,19 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        case SPRN_SRR1:
                                vcpu->arch.gpr[rt] = vcpu->arch.srr1; break;
                        case SPRN_PVR:
-                               vcpu->arch.gpr[rt] = mfspr(SPRN_PVR); break;
+                               vcpu->arch.gpr[rt] = vcpu->arch.pvr; break;
                        case SPRN_PIR:
-                               vcpu->arch.gpr[rt] = mfspr(SPRN_PIR); break;
+                               vcpu->arch.gpr[rt] = vcpu->vcpu_id; break;
+                       case SPRN_MSSSR0:
+                               vcpu->arch.gpr[rt] = 0; break;
 
                        /* Note: mftb and TBRL/TBWL are user-accessible, so
                         * the guest can always access the real TB anyways.
                         * In fact, we probably will never see these traps. */
                        case SPRN_TBWL:
-                               vcpu->arch.gpr[rt] = mftbl(); break;
+                               vcpu->arch.gpr[rt] = get_tb() >> 32; break;
                        case SPRN_TBWU:
-                               vcpu->arch.gpr[rt] = mftbu(); break;
+                               vcpu->arch.gpr[rt] = get_tb(); break;
 
                        case SPRN_SPRG0:
                                vcpu->arch.gpr[rt] = vcpu->arch.sprg0; break;
@@ -211,6 +246,13 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        /* Note: SPRG4-7 are user-readable, so we don't get
                         * a trap. */
 
+                       case SPRN_DEC:
+                       {
+                               u64 jd = get_tb() - vcpu->arch.dec_jiffies;
+                               vcpu->arch.gpr[rt] = vcpu->arch.dec - jd;
+                               pr_debug("mfDEC: %x - %llx = %lx\n",
+                                        vcpu->arch.dec, jd,
+                                        vcpu->arch.gpr[rt]);
+                               break;
+                       }
                        default:
                                emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, rt);
                                if (emulated == EMULATE_FAIL) {
@@ -260,6 +302,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        case SPRN_TBWL: break;
                        case SPRN_TBWU: break;
 
+                       case SPRN_MSSSR0: break;
+
                        case SPRN_DEC:
                                vcpu->arch.dec = vcpu->arch.gpr[rs];
                                kvmppc_emulate_dec(vcpu);
index 2a4551f78f60c242d3477b6742efaf8aa322aaab..692c3709011ee409d7f2a42607d28f2bff46b92f 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/kvm_host.h>
 #include <linux/module.h>
 #include <linux/vmalloc.h>
+#include <linux/hrtimer.h>
 #include <linux/fs.h>
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
@@ -208,10 +209,25 @@ static void kvmppc_decrementer_func(unsigned long data)
        }
 }
 
+/*
+ * low level hrtimer wake routine. Because this runs in hardirq context
+ * we schedule a tasklet to do the real work.
+ */
+enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
+{
+       struct kvm_vcpu *vcpu;
+
+       vcpu = container_of(timer, struct kvm_vcpu, arch.dec_timer);
+       tasklet_schedule(&vcpu->arch.tasklet);
+
+       return HRTIMER_NORESTART;
+}
+
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
-       setup_timer(&vcpu->arch.dec_timer, kvmppc_decrementer_func,
-                   (unsigned long)vcpu);
+       hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+       tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
+       vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
 
        return 0;
 }
@@ -409,11 +425,6 @@ out:
        return r;
 }
 
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
-{
-       return -ENOTSUPP;
-}
-
 long kvm_arch_vm_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg)
 {
index 2aa371e30079ced2c7938133b355ae7ca54b7343..70378551c0cc1b0096383ba89b314a963654de1d 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
+#include <linux/module.h>
 
 #include <asm/time.h>
 #include <asm-generic/div64.h>
index 67f219de04558e3abd15a249e94c8c4d4470bcc0..a8e8400180525b4e9a0c6894ddc100b286140626 100644 (file)
@@ -12,8 +12,8 @@
  * Tracepoint for guest mode entry.
  */
 TRACE_EVENT(kvm_ppc_instr,
-       TP_PROTO(unsigned int inst, unsigned long pc, unsigned int emulate),
-       TP_ARGS(inst, pc, emulate),
+       TP_PROTO(unsigned int inst, unsigned long _pc, unsigned int emulate),
+       TP_ARGS(inst, _pc, emulate),
 
        TP_STRUCT__entry(
                __field(        unsigned int,   inst            )
@@ -23,7 +23,7 @@ TRACE_EVENT(kvm_ppc_instr,
 
        TP_fast_assign(
                __entry->inst           = inst;
-               __entry->pc             = pc;
+               __entry->pc             = _pc;
                __entry->emulate        = emulate;
        ),
 
index 6fb8fc8d2feafb9993ccc3f7457d0ff76e1f2587..ce68708bbad53c016986eabf7dfe2ef08ba33a97 100644 (file)
@@ -28,7 +28,10 @@ obj-$(CONFIG_44x)            += 44x_mmu.o
 obj-$(CONFIG_FSL_BOOKE)                += fsl_booke_mmu.o
 obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o
 obj-$(CONFIG_PPC_MM_SLICES)    += slice.o
-obj-$(CONFIG_HUGETLB_PAGE)     += hugetlbpage.o
+ifeq ($(CONFIG_HUGETLB_PAGE),y)
+obj-y                          += hugetlbpage.o
+obj-$(CONFIG_PPC_STD_MMU_64)   += hugetlbpage-hash64.o
+endif
 obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)          += highmem.o
index bc122a120bf009c7b6457e56936d5e9f47071dd6..d7efdbf640c7d5e39cd8c3c55f79aa0b8ca64f10 100644 (file)
@@ -55,57 +55,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
        return 1;
 }
 
-#ifdef CONFIG_HUGETLB_PAGE
-static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate,
-                                unsigned long *addr, unsigned long end,
-                                int write, struct page **pages, int *nr)
-{
-       unsigned long mask;
-       unsigned long pte_end;
-       struct page *head, *page;
-       pte_t pte;
-       int refs;
-
-       pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate);
-       if (pte_end < end)
-               end = pte_end;
-
-       pte = *ptep;
-       mask = _PAGE_PRESENT|_PAGE_USER;
-       if (write)
-               mask |= _PAGE_RW;
-       if ((pte_val(pte) & mask) != mask)
-               return 0;
-       /* hugepages are never "special" */
-       VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
-
-       refs = 0;
-       head = pte_page(pte);
-       page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT);
-       do {
-               VM_BUG_ON(compound_head(page) != head);
-               pages[*nr] = page;
-               (*nr)++;
-               page++;
-               refs++;
-       } while (*addr += PAGE_SIZE, *addr != end);
-
-       if (!page_cache_add_speculative(head, refs)) {
-               *nr -= refs;
-               return 0;
-       }
-       if (unlikely(pte_val(pte) != pte_val(*ptep))) {
-               /* Could be optimized better */
-               while (*nr) {
-                       put_page(page);
-                       (*nr)--;
-               }
-       }
-
-       return 1;
-}
-#endif /* CONFIG_HUGETLB_PAGE */
-
 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                int write, struct page **pages, int *nr)
 {
@@ -119,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
                next = pmd_addr_end(addr, end);
                if (pmd_none(pmd))
                        return 0;
-               if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+               if (is_hugepd(pmdp)) {
+                       if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT,
+                                       addr, next, write, pages, nr))
+                               return 0;
+               } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
                        return 0;
        } while (pmdp++, addr = next, addr != end);
 
@@ -139,7 +92,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
                next = pud_addr_end(addr, end);
                if (pud_none(pud))
                        return 0;
-               if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+               if (is_hugepd(pudp)) {
+                       if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT,
+                                       addr, next, write, pages, nr))
+                               return 0;
+               } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
                        return 0;
        } while (pudp++, addr = next, addr != end);
 
@@ -154,10 +111,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
        unsigned long next;
        pgd_t *pgdp;
        int nr = 0;
-#ifdef CONFIG_PPC64
-       unsigned int shift;
-       int psize;
-#endif
 
        pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read");
 
@@ -172,25 +125,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 
        pr_devel("  aligned: %lx .. %lx\n", start, end);
 
-#ifdef CONFIG_HUGETLB_PAGE
-       /* We bail out on slice boundary crossing when hugetlb is
-        * enabled in order to not have to deal with two different
-        * page table formats
-        */
-       if (addr < SLICE_LOW_TOP) {
-               if (end > SLICE_LOW_TOP)
-                       goto slow_irqon;
-
-               if (unlikely(GET_LOW_SLICE_INDEX(addr) !=
-                            GET_LOW_SLICE_INDEX(end - 1)))
-                       goto slow_irqon;
-       } else {
-               if (unlikely(GET_HIGH_SLICE_INDEX(addr) !=
-                            GET_HIGH_SLICE_INDEX(end - 1)))
-                       goto slow_irqon;
-       }
-#endif /* CONFIG_HUGETLB_PAGE */
-
        /*
         * XXX: batch / limit 'nr', to avoid large irq off latency
         * needs some instrumenting to determine the common sizes used by
@@ -210,54 +144,23 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
         */
        local_irq_disable();
 
-#ifdef CONFIG_PPC64
-       /* Those bits are related to hugetlbfs implementation and only exist
-        * on 64-bit for now
-        */
-       psize = get_slice_psize(mm, addr);
-       shift = mmu_psize_defs[psize].shift;
-#endif /* CONFIG_PPC64 */
-
-#ifdef CONFIG_HUGETLB_PAGE
-       if (unlikely(mmu_huge_psizes[psize])) {
-               pte_t *ptep;
-               unsigned long a = addr;
-               unsigned long sz = ((1UL) << shift);
-               struct hstate *hstate = size_to_hstate(sz);
-
-               BUG_ON(!hstate);
-               /*
-                * XXX: could be optimized to avoid hstate
-                * lookup entirely (just use shift)
-                */
-
-               do {
-                       VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift);
-                       ptep = huge_pte_offset(mm, a);
-                       pr_devel(" %016lx: huge ptep %p\n", a, ptep);
-                       if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages,
-                                                  &nr))
-                               goto slow;
-               } while (a != end);
-       } else
-#endif /* CONFIG_HUGETLB_PAGE */
-       {
-               pgdp = pgd_offset(mm, addr);
-               do {
-                       pgd_t pgd = *pgdp;
-
-#ifdef CONFIG_PPC64
-                       VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift);
-#endif
-                       pr_devel("  %016lx: normal pgd %p\n", addr,
-                                (void *)pgd_val(pgd));
-                       next = pgd_addr_end(addr, end);
-                       if (pgd_none(pgd))
-                               goto slow;
-                       if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+       pgdp = pgd_offset(mm, addr);
+       do {
+               pgd_t pgd = *pgdp;
+
+               pr_devel("  %016lx: normal pgd %p\n", addr,
+                        (void *)pgd_val(pgd));
+               next = pgd_addr_end(addr, end);
+               if (pgd_none(pgd))
+                       goto slow;
+               if (is_hugepd(pgdp)) {
+                       if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
+                                       addr, next, write, pages, &nr))
                                goto slow;
-               } while (pgdp++, addr = next, addr != end);
-       }
+               } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+                       goto slow;
+       } while (pgdp++, addr = next, addr != end);
+
        local_irq_enable();
 
        VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
index 1ade7eb6ae00a3e8f352526f9b556cb05408943a..6810128aba305730f3a61ef4edd2fc2c49042c37 100644 (file)
@@ -92,6 +92,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
 struct hash_pte *htab_address;
 unsigned long htab_size_bytes;
 unsigned long htab_hash_mask;
+EXPORT_SYMBOL_GPL(htab_hash_mask);
 int mmu_linear_psize = MMU_PAGE_4K;
 int mmu_virtual_psize = MMU_PAGE_4K;
 int mmu_vmalloc_psize = MMU_PAGE_4K;
@@ -102,6 +103,7 @@ int mmu_io_psize = MMU_PAGE_4K;
 int mmu_kernel_ssize = MMU_SEGSIZE_256M;
 int mmu_highuser_ssize = MMU_SEGSIZE_256M;
 u16 mmu_slb_size = 64;
+EXPORT_SYMBOL_GPL(mmu_slb_size);
 #ifdef CONFIG_HUGETLB_PAGE
 unsigned int HPAGE_SHIFT;
 #endif
@@ -481,16 +483,6 @@ static void __init htab_init_page_sizes(void)
 #ifdef CONFIG_HUGETLB_PAGE
        /* Reserve 16G huge page memory sections for huge pages */
        of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL);
-
-/* Set default large page size. Currently, we pick 16M or 1M depending
-        * on what is available
-        */
-       if (mmu_psize_defs[MMU_PAGE_16M].shift)
-               HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
-       /* With 4k/4level pagetables, we can't (for now) cope with a
-        * huge page size < PMD_SIZE */
-       else if (mmu_psize_defs[MMU_PAGE_1M].shift)
-               HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
 #endif /* CONFIG_HUGETLB_PAGE */
 }
 
@@ -785,7 +777,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap)
        /* page is dirty */
        if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
                if (trap == 0x400) {
-                       __flush_dcache_icache(page_address(page));
+                       flush_dcache_icache_page(page);
                        set_bit(PG_arch_1, &page->flags);
                } else
                        pp |= HPTE_R_N;
@@ -891,6 +883,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
        unsigned long vsid;
        struct mm_struct *mm;
        pte_t *ptep;
+       unsigned hugeshift;
        const struct cpumask *tmp;
        int rc, user_region = 0, local = 0;
        int psize, ssize;
@@ -943,30 +936,31 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap)
        if (user_region && cpumask_equal(mm_cpumask(mm), tmp))
                local = 1;
 
-#ifdef CONFIG_HUGETLB_PAGE
-       /* Handle hugepage regions */
-       if (HPAGE_SHIFT && mmu_huge_psizes[psize]) {
-               DBG_LOW(" -> huge page !\n");
-               return hash_huge_page(mm, access, ea, vsid, local, trap);
-       }
-#endif /* CONFIG_HUGETLB_PAGE */
-
 #ifndef CONFIG_PPC_64K_PAGES
-       /* If we use 4K pages and our psize is not 4K, then we are hitting
-        * a special driver mapping, we need to align the address before
-        * we fetch the PTE
+       /* If we use 4K pages and our psize is not 4K, then we might
+        * be hitting a special driver mapping, and need to align the
+        * address before we fetch the PTE.
+        *
+        * It could also be a hugepage mapping, in which case this is
+        * not necessary, but it's not harmful, either.
         */
        if (psize != MMU_PAGE_4K)
                ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
 #endif /* CONFIG_PPC_64K_PAGES */
 
        /* Get PTE and page size from page tables */
-       ptep = find_linux_pte(pgdir, ea);
+       ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
        if (ptep == NULL || !pte_present(*ptep)) {
                DBG_LOW(" no PTE !\n");
                return 1;
        }
 
+#ifdef CONFIG_HUGETLB_PAGE
+       if (hugeshift)
+               return __hash_page_huge(ea, access, vsid, ptep, trap, local,
+                                       ssize, hugeshift, psize);
+#endif /* CONFIG_HUGETLB_PAGE */
+
 #ifndef CONFIG_PPC_64K_PAGES
        DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep));
 #else
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
new file mode 100644 (file)
index 0000000..1995398
--- /dev/null
@@ -0,0 +1,139 @@
+/*
+ * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later)
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/machdep.h>
+
+int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
+                    pte_t *ptep, unsigned long trap, int local, int ssize,
+                    unsigned int shift, unsigned int mmu_psize)
+{
+       unsigned long old_pte, new_pte;
+       unsigned long va, rflags, pa, sz;
+       long slot;
+       int err = 1;
+
+       BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);
+
+       /* Search the Linux page table for a match with va */
+       va = hpt_va(ea, vsid, ssize);
+
+       /*
+        * Check the user's access rights to the page.  If access should be
+        * prevented then send the problem up to do_page_fault.
+        */
+       if (unlikely(access & ~pte_val(*ptep)))
+               goto out;
+       /*
+        * At this point, we have a pte (old_pte) which can be used to build
+        * or update an HPTE. There are 2 cases:
+        *
+        * 1. There is a valid (present) pte with no associated HPTE (this is
+        *      the most common case)
+        * 2. There is a valid (present) pte with an associated HPTE. The
+        *      current values of the pp bits in the HPTE prevent access
+        *      because we are doing software DIRTY bit management and the
+        *      page is currently not DIRTY.
+        */
+
+       do {
+               old_pte = pte_val(*ptep);
+               if (old_pte & _PAGE_BUSY)
+                       goto out;
+               new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
+       } while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
+                                        old_pte, new_pte));
+
+       rflags = 0x2 | (!(new_pte & _PAGE_RW));
+       /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
+       rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
+       sz = ((1UL) << shift);
+       if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
+               /* No CPU supports hugepages but lacks no-execute, so we
+                * don't need to worry about that case */
+               rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);
+
+       /* Check if pte already has an hpte (case 2) */
+       if (unlikely(old_pte & _PAGE_HASHPTE)) {
+               /* There MIGHT be an HPTE for this pte */
+               unsigned long hash, slot;
+
+               hash = hpt_hash(va, shift, ssize);
+               if (old_pte & _PAGE_F_SECOND)
+                       hash = ~hash;
+               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+               slot += (old_pte & _PAGE_F_GIX) >> 12;
+
+               if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
+                                        ssize, local) == -1)
+                       old_pte &= ~_PAGE_HPTEFLAGS;
+       }
+
+       if (likely(!(old_pte & _PAGE_HASHPTE))) {
+               unsigned long hash = hpt_hash(va, shift, ssize);
+               unsigned long hpte_group;
+
+               pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
+
+repeat:
+               hpte_group = ((hash & htab_hash_mask) *
+                             HPTES_PER_GROUP) & ~0x7UL;
+
+               /* clear HPTE slot information in new PTE */
+#ifdef CONFIG_PPC_64K_PAGES
+               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
+#else
+               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
+#endif
+               /* Add in WIMG bits */
+               rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
+                                     _PAGE_COHERENT | _PAGE_GUARDED));
+
+               /* Insert into the hash table, primary slot */
+               slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
+                                         mmu_psize, ssize);
+
+               /* Primary is full, try the secondary */
+               if (unlikely(slot == -1)) {
+                       hpte_group = ((~hash & htab_hash_mask) *
+                                     HPTES_PER_GROUP) & ~0x7UL;
+                       slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
+                                                 HPTE_V_SECONDARY,
+                                                 mmu_psize, ssize);
+                       if (slot == -1) {
+                               if (mftb() & 0x1)
+                                       hpte_group = ((hash & htab_hash_mask) *
+                                                     HPTES_PER_GROUP)&~0x7UL;
+
+                               ppc_md.hpte_remove(hpte_group);
+                               goto repeat;
+                       }
+               }
+
+               if (unlikely(slot == -2))
+                       panic("hash_huge_page: pte_insert failed\n");
+
+               new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+       }
+
+       /*
+        * No need to use ldarx/stdcx here
+        */
+       *ptep = __pte(new_pte & ~_PAGE_BUSY);
+
+       err = 0;
+
+ out:
+       return err;
+}
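
The _PAGE_BUSY loop at the top of __hash_page_huge is a classic compare-and-swap claim. A userspace sketch using GCC atomics in place of __cmpxchg_u64 (the flag values are stand-ins for the ppc64 definitions):

#include <stdint.h>
#include <stdio.h>

#define _PAGE_BUSY     0x0800ULL   /* stand-in value */
#define _PAGE_ACCESSED 0x0008ULL   /* stand-in value */

int main(void)
{
	uint64_t pte = 0x0000000001234086ULL;   /* invented present PTE */
	uint64_t old, new;

	do {
		old = __atomic_load_n(&pte, __ATOMIC_RELAXED);
		if (old & _PAGE_BUSY)
			return 1;   /* someone else is updating it: bail */
		new = old | _PAGE_BUSY | _PAGE_ACCESSED;
	} while (!__atomic_compare_exchange_n(&pte, &old, new, 0,
					      __ATOMIC_ACQUIRE, __ATOMIC_RELAXED));

	printf("claimed: 0x%016llx\n", (unsigned long long)pte);
	return 0;
}
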
index 90df6ffe3a43140fcbfcb5ceaa312fdb7c642319..53b200abb025ba26fa006cf4b39491b11d1d20d8 100644 (file)
@@ -7,29 +7,17 @@
  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
  */
 
-#include <linux/init.h>
-#include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/io.h>
 #include <linux/hugetlb.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/sysctl.h>
-#include <asm/mman.h>
+#include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <asm/machdep.h>
-#include <asm/cputable.h>
-#include <asm/spu.h>
 
 #define PAGE_SHIFT_64K 16
 #define PAGE_SHIFT_16M 24
 #define PAGE_SHIFT_16G 34
 
-#define NUM_LOW_AREAS  (0x100000000UL >> SID_SHIFT)
-#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
 #define MAX_NUMBER_GPAGES      1024
 
 /* Tracks the 16G pages after the device tree is scanned and before the
  * huge_boot_pages list is ready.  */
 static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
 static unsigned nr_gpages;
 
-/* Array of valid huge page sizes - non-zero value(hugepte_shift) is
- * stored for the huge page sizes that are valid.
- */
-unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */
-
-#define hugepte_shift                  mmu_huge_psizes
-#define PTRS_PER_HUGEPTE(psize)                (1 << hugepte_shift[psize])
-#define HUGEPTE_TABLE_SIZE(psize)      (sizeof(pte_t) << hugepte_shift[psize])
-
-#define HUGEPD_SHIFT(psize)            (mmu_psize_to_shift(psize) \
-                                               + hugepte_shift[psize])
-#define HUGEPD_SIZE(psize)             (1UL << HUGEPD_SHIFT(psize))
-#define HUGEPD_MASK(psize)             (~(HUGEPD_SIZE(psize)-1))
-
-/* Subtract one from array size because we don't need a cache for 4K since
- * is not a huge page size */
-#define HUGE_PGTABLE_INDEX(psize)      (HUGEPTE_CACHE_NUM + psize - 1)
-#define HUGEPTE_CACHE_NAME(psize)      (huge_pgtable_cache_name[psize])
-
-static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = {
-       [MMU_PAGE_64K]  = "hugepte_cache_64K",
-       [MMU_PAGE_1M]   = "hugepte_cache_1M",
-       [MMU_PAGE_16M]  = "hugepte_cache_16M",
-       [MMU_PAGE_16G]  = "hugepte_cache_16G",
-};
-
 /* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
  * will choke on pointers to hugepte tables, which is handy for
  * catching screwups early. */
-#define HUGEPD_OK      0x1
-
-typedef struct { unsigned long pd; } hugepd_t;
-
-#define hugepd_none(hpd)       ((hpd).pd == 0)
 
 static inline int shift_to_mmu_psize(unsigned int shift)
 {
-       switch (shift) {
-#ifndef CONFIG_PPC_64K_PAGES
-       case PAGE_SHIFT_64K:
-           return MMU_PAGE_64K;
-#endif
-       case PAGE_SHIFT_16M:
-           return MMU_PAGE_16M;
-       case PAGE_SHIFT_16G:
-           return MMU_PAGE_16G;
-       }
+       int psize;
+
+       for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
+               if (mmu_psize_defs[psize].shift == shift)
+                       return psize;
        return -1;
 }
 
@@ -94,71 +46,126 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
        BUG();
 }
 
+#define hugepd_none(hpd)       ((hpd).pd == 0)
+
 static inline pte_t *hugepd_page(hugepd_t hpd)
 {
-       BUG_ON(!(hpd.pd & HUGEPD_OK));
-       return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+       BUG_ON(!hugepd_ok(hpd));
+       return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
 }
 
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
-                                   struct hstate *hstate)
+static inline unsigned int hugepd_shift(hugepd_t hpd)
 {
-       unsigned int shift = huge_page_shift(hstate);
-       int psize = shift_to_mmu_psize(shift);
-       unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1));
+       return hpd.pd & HUGEPD_SHIFT_MASK;
+}
+
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
+{
+       unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
        pte_t *dir = hugepd_page(*hpdp);
 
        return dir + idx;
 }
 
+pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+{
+       pgd_t *pg;
+       pud_t *pu;
+       pmd_t *pm;
+       hugepd_t *hpdp = NULL;
+       unsigned pdshift = PGDIR_SHIFT;
+
+       if (shift)
+               *shift = 0;
+
+       pg = pgdir + pgd_index(ea);
+       if (is_hugepd(pg)) {
+               hpdp = (hugepd_t *)pg;
+       } else if (!pgd_none(*pg)) {
+               pdshift = PUD_SHIFT;
+               pu = pud_offset(pg, ea);
+               if (is_hugepd(pu))
+                       hpdp = (hugepd_t *)pu;
+               else if (!pud_none(*pu)) {
+                       pdshift = PMD_SHIFT;
+                       pm = pmd_offset(pu, ea);
+                       if (is_hugepd(pm))
+                               hpdp = (hugepd_t *)pm;
+                       else if (!pmd_none(*pm)) {
+                               return pte_offset_map(pm, ea);
+                       }
+               }
+       }
+
+       if (!hpdp)
+               return NULL;
+
+       if (shift)
+               *shift = hugepd_shift(*hpdp);
+       return hugepte_offset(hpdp, ea, pdshift);
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+       return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
+}
+
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
-                          unsigned long address, unsigned int psize)
+                          unsigned long address, unsigned pdshift, unsigned pshift)
 {
-       pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)],
-                                     GFP_KERNEL|__GFP_REPEAT);
+       pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
+                                      GFP_KERNEL|__GFP_REPEAT);
+
+       BUG_ON(pshift > HUGEPD_SHIFT_MASK);
+       BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
 
        if (!new)
                return -ENOMEM;
 
        spin_lock(&mm->page_table_lock);
        if (!hugepd_none(*hpdp))
-               kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new);
+               kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
        else
-               hpdp->pd = (unsigned long)new | HUGEPD_OK;
+               hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
        spin_unlock(&mm->page_table_lock);
        return 0;
 }
 
-
-static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
 {
-       if (huge_page_shift(hstate) < PUD_SHIFT)
-               return pud_offset(pgd, addr);
-       else
-               return (pud_t *) pgd;
-}
-static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr,
-                        struct hstate *hstate)
-{
-       if (huge_page_shift(hstate) < PUD_SHIFT)
-               return pud_alloc(mm, pgd, addr);
-       else
-               return (pud_t *) pgd;
-}
-static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate)
-{
-       if (huge_page_shift(hstate) < PMD_SHIFT)
-               return pmd_offset(pud, addr);
-       else
-               return (pmd_t *) pud;
-}
-static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr,
-                        struct hstate *hstate)
-{
-       if (huge_page_shift(hstate) < PMD_SHIFT)
-               return pmd_alloc(mm, pud, addr);
-       else
-               return (pmd_t *) pud;
+       pgd_t *pg;
+       pud_t *pu;
+       pmd_t *pm;
+       hugepd_t *hpdp = NULL;
+       unsigned pshift = __ffs(sz);
+       unsigned pdshift = PGDIR_SHIFT;
+
+       addr &= ~(sz-1);
+
+       pg = pgd_offset(mm, addr);
+       if (pshift >= PUD_SHIFT) {
+               hpdp = (hugepd_t *)pg;
+       } else {
+               pdshift = PUD_SHIFT;
+               pu = pud_alloc(mm, pg, addr);
+               if (pshift >= PMD_SHIFT) {
+                       hpdp = (hugepd_t *)pu;
+               } else {
+                       pdshift = PMD_SHIFT;
+                       pm = pmd_alloc(mm, pu, addr);
+                       hpdp = (hugepd_t *)pm;
+               }
+       }
+
+       if (!hpdp)
+               return NULL;
+
+       BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
+
+       if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
+               return NULL;
+
+       return hugepte_offset(hpdp, addr, pdshift);
 }
 
 /* Build list of addresses of gigantic pages.  This function is used in early
@@ -192,94 +199,38 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
        return 1;
 }
 
-
-/* Modelled after find_linux_pte() */
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
-       pgd_t *pg;
-       pud_t *pu;
-       pmd_t *pm;
-
-       unsigned int psize;
-       unsigned int shift;
-       unsigned long sz;
-       struct hstate *hstate;
-       psize = get_slice_psize(mm, addr);
-       shift = mmu_psize_to_shift(psize);
-       sz = ((1UL) << shift);
-       hstate = size_to_hstate(sz);
-
-       addr &= hstate->mask;
-
-       pg = pgd_offset(mm, addr);
-       if (!pgd_none(*pg)) {
-               pu = hpud_offset(pg, addr, hstate);
-               if (!pud_none(*pu)) {
-                       pm = hpmd_offset(pu, addr, hstate);
-                       if (!pmd_none(*pm))
-                               return hugepte_offset((hugepd_t *)pm, addr,
-                                                     hstate);
-               }
-       }
-
-       return NULL;
-}
-
-pte_t *huge_pte_alloc(struct mm_struct *mm,
-                       unsigned long addr, unsigned long sz)
-{
-       pgd_t *pg;
-       pud_t *pu;
-       pmd_t *pm;
-       hugepd_t *hpdp = NULL;
-       struct hstate *hstate;
-       unsigned int psize;
-       hstate = size_to_hstate(sz);
-
-       psize = get_slice_psize(mm, addr);
-       BUG_ON(!mmu_huge_psizes[psize]);
-
-       addr &= hstate->mask;
-
-       pg = pgd_offset(mm, addr);
-       pu = hpud_alloc(mm, pg, addr, hstate);
-
-       if (pu) {
-               pm = hpmd_alloc(mm, pu, addr, hstate);
-               if (pm)
-                       hpdp = (hugepd_t *)pm;
-       }
-
-       if (! hpdp)
-               return NULL;
-
-       if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize))
-               return NULL;
-
-       return hugepte_offset(hpdp, addr, hstate);
-}
-
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 {
        return 0;
 }
 
-static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp,
-                              unsigned int psize)
+static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
+                             unsigned long start, unsigned long end,
+                             unsigned long floor, unsigned long ceiling)
 {
        pte_t *hugepte = hugepd_page(*hpdp);
+       unsigned shift = hugepd_shift(*hpdp);
+       unsigned long pdmask = ~((1UL << pdshift) - 1);
+
+       start &= pdmask;
+       if (start < floor)
+               return;
+       if (ceiling) {
+               ceiling &= pdmask;
+               if (!ceiling)
+                       return;
+       }
+       if (end - 1 > ceiling - 1)
+               return;
 
        hpdp->pd = 0;
        tlb->need_flush = 1;
-       pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
-                                                HUGEPTE_CACHE_NUM+psize-1,
-                                                PGF_CACHENUM_MASK));
+       pgtable_free_tlb(tlb, hugepte, pdshift - shift);
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                   unsigned long addr, unsigned long end,
-                                  unsigned long floor, unsigned long ceiling,
-                                  unsigned int psize)
+                                  unsigned long floor, unsigned long ceiling)
 {
        pmd_t *pmd;
        unsigned long next;
@@ -291,7 +242,8 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                next = pmd_addr_end(addr, end);
                if (pmd_none(*pmd))
                        continue;
-               free_hugepte_range(tlb, (hugepd_t *)pmd, psize);
+               free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
+                                 addr, next, floor, ceiling);
        } while (pmd++, addr = next, addr != end);
 
        start &= PUD_MASK;
@@ -317,23 +269,19 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
        pud_t *pud;
        unsigned long next;
        unsigned long start;
-       unsigned int shift;
-       unsigned int psize = get_slice_psize(tlb->mm, addr);
-       shift = mmu_psize_to_shift(psize);
 
        start = addr;
        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
-               if (shift < PMD_SHIFT) {
+               if (!is_hugepd(pud)) {
                        if (pud_none_or_clear_bad(pud))
                                continue;
                        hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
-                                              ceiling, psize);
+                                              ceiling);
                } else {
-                       if (pud_none(*pud))
-                               continue;
-                       free_hugepte_range(tlb, (hugepd_t *)pud, psize);
+                       free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
+                                         addr, next, floor, ceiling);
                }
        } while (pud++, addr = next, addr != end);
 
@@ -364,121 +312,56 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 {
        pgd_t *pgd;
        unsigned long next;
-       unsigned long start;
 
        /*
-        * Comments below take from the normal free_pgd_range().  They
-        * apply here too.  The tests against HUGEPD_MASK below are
-        * essential, because we *don't* test for this at the bottom
-        * level.  Without them we'll attempt to free a hugepte table
-        * when we unmap just part of it, even if there are other
-        * active mappings using it.
-        *
-        * The next few lines have given us lots of grief...
+        * Because there are a number of different possible pagetable
+        * layouts for hugepage ranges, we limit knowledge of how
+        * things should be laid out to the allocation path
+        * (huge_pte_alloc(), above).  Everything else works out the
+        * structure as it goes from information in the hugepd
+        * pointers.  That means that we can't here use the
+        * optimization used in the normal page free_pgd_range(), of
+        * checking whether we're actually covering a large enough
+        * range to have to do anything at the top level of the walk
+        * instead of at the bottom.
         *
-        * Why are we testing HUGEPD* at this top level?  Because
-        * often there will be no work to do at all, and we'd prefer
-        * not to go all the way down to the bottom just to discover
-        * that.
-        *
-        * Why all these "- 1"s?  Because 0 represents both the bottom
-        * of the address space and the top of it (using -1 for the
-        * top wouldn't help much: the masks would do the wrong thing).
-        * The rule is that addr 0 and floor 0 refer to the bottom of
-        * the address space, but end 0 and ceiling 0 refer to the top
-        * Comparisons need to use "end - 1" and "ceiling - 1" (though
-        * that end 0 case should be mythical).
-        *
-        * Wherever addr is brought up or ceiling brought down, we
-        * must be careful to reject "the opposite 0" before it
-        * confuses the subsequent tests.  But what about where end is
-        * brought down by HUGEPD_SIZE below? no, end can't go down to
-        * 0 there.
-        *
-        * Whereas we round start (addr) and ceiling down, by different
-        * masks at different levels, in order to test whether a table
-        * now has no other vmas using it, so can be freed, we don't
-        * bother to round floor or end up - the tests don't need that.
+        * To make sense of this, you should probably go read the big
+        * block comment at the top of the normal free_pgd_range(),
+        * too.
         */
-       unsigned int psize = get_slice_psize(tlb->mm, addr);
-
-       addr &= HUGEPD_MASK(psize);
-       if (addr < floor) {
-               addr += HUGEPD_SIZE(psize);
-               if (!addr)
-                       return;
-       }
-       if (ceiling) {
-               ceiling &= HUGEPD_MASK(psize);
-               if (!ceiling)
-                       return;
-       }
-       if (end - 1 > ceiling - 1)
-               end -= HUGEPD_SIZE(psize);
-       if (addr > end - 1)
-               return;
 
-       start = addr;
        pgd = pgd_offset(tlb->mm, addr);
        do {
-               psize = get_slice_psize(tlb->mm, addr);
-               BUG_ON(!mmu_huge_psizes[psize]);
                next = pgd_addr_end(addr, end);
-               if (mmu_psize_to_shift(psize) < PUD_SHIFT) {
+               if (!is_hugepd(pgd)) {
                        if (pgd_none_or_clear_bad(pgd))
                                continue;
                        hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
                } else {
-                       if (pgd_none(*pgd))
-                               continue;
-                       free_hugepte_range(tlb, (hugepd_t *)pgd, psize);
+                       free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
+                                         addr, next, floor, ceiling);
                }
        } while (pgd++, addr = next, addr != end);
 }
 
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-                    pte_t *ptep, pte_t pte)
-{
-       if (pte_present(*ptep)) {
-               /* We open-code pte_clear because we need to pass the right
-                * argument to hpte_need_flush (huge / !huge). Might not be
-                * necessary anymore if we make hpte_need_flush() get the
-                * page size from the slices
-                */
-               unsigned int psize = get_slice_psize(mm, addr);
-               unsigned int shift = mmu_psize_to_shift(psize);
-               unsigned long sz = ((1UL) << shift);
-               struct hstate *hstate = size_to_hstate(sz);
-               pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1);
-       }
-       *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
-}
-
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
-                             pte_t *ptep)
-{
-       unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
-       return __pte(old);
-}
-
 struct page *
 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
        pte_t *ptep;
        struct page *page;
-       unsigned int mmu_psize = get_slice_psize(mm, address);
+       unsigned shift;
+       unsigned long mask;
+
+       ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
 
        /* Verify it is a huge page else bail. */
-       if (!mmu_huge_psizes[mmu_psize])
+       if (!ptep || !shift)
                return ERR_PTR(-EINVAL);
 
-       ptep = huge_pte_offset(mm, address);
+       mask = (1UL << shift) - 1;
        page = pte_page(*ptep);
-       if (page) {
-               unsigned int shift = mmu_psize_to_shift(mmu_psize);
-               unsigned long sz = ((1UL) << shift);
-               page += (address % sz) / PAGE_SIZE;
-       }
+       if (page)
+               page += (address & mask) / PAGE_SIZE;
 
        return page;
 }
@@ -501,6 +384,73 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
        return NULL;
 }
 
+static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+                      unsigned long end, int write, struct page **pages, int *nr)
+{
+       unsigned long mask;
+       unsigned long pte_end;
+       struct page *head, *page;
+       pte_t pte;
+       int refs;
+
+       pte_end = (addr + sz) & ~(sz-1);
+       if (pte_end < end)
+               end = pte_end;
+
+       pte = *ptep;
+       mask = _PAGE_PRESENT | _PAGE_USER;
+       if (write)
+               mask |= _PAGE_RW;
+
+       if ((pte_val(pte) & mask) != mask)
+               return 0;
+
+       /* hugepages are never "special" */
+       VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+       refs = 0;
+       head = pte_page(pte);
+
+       page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+       do {
+               VM_BUG_ON(compound_head(page) != head);
+               pages[*nr] = page;
+               (*nr)++;
+               page++;
+               refs++;
+       } while (addr += PAGE_SIZE, addr != end);
+
+       if (!page_cache_add_speculative(head, refs)) {
+               *nr -= refs;
+               return 0;
+       }
+
+       if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+               /* The PTE changed under us: drop the refs we took on
+                * head and make the caller fall back to the slow path. */
+               *nr -= refs;
+               while (refs--)
+                       put_page(head);
+               return 0;
+       }
+
+       return 1;
+}
+
+int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
+              unsigned long addr, unsigned long end,
+              int write, struct page **pages, int *nr)
+{
+       pte_t *ptep;
+       unsigned long sz = 1UL << hugepd_shift(*hugepd);
+
+       ptep = hugepte_offset(hugepd, addr, pdshift);
+       do {
+               if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
+                       return 0;
+       } while (ptep++, addr += sz, addr != end);
+
+       return 1;
+}
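
gup_hugepte() is the standard lockless fast-gup pattern: snapshot the pte, batch the refcounts onto the compound head with page_cache_add_speculative(), then re-check the pte and back out if it changed underneath. gup_hugepd() simply steps through the hugepte table one sz-sized slot at a time. A hypothetical fragment of the fast-gup walker that would invoke it on meeting a hugepd entry at pgd level:

	if (is_hugepd(pgdp)) {
		/* hugepd covers [addr, next); 0 means fall back to slow gup */
		if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT,
				addr, next, write, pages, nr))
			return 0;
	}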
 
 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
@@ -509,8 +459,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
        struct hstate *hstate = hstate_file(file);
        int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 
-       if (!mmu_huge_psizes[mmu_psize])
-               return -EINVAL;
        return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
 }
 
@@ -521,229 +469,46 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
        return 1UL << mmu_psize_to_shift(psize);
 }
 
-/*
- * Called by asm hashtable.S for doing lazy icache flush
- */
-static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
-                                       pte_t pte, int trap, unsigned long sz)
+static int __init add_huge_page_size(unsigned long long size)
 {
-       struct page *page;
-       int i;
-
-       if (!pfn_valid(pte_pfn(pte)))
-               return rflags;
-
-       page = pte_page(pte);
-
-       /* page is dirty */
-       if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
-               if (trap == 0x400) {
-                       for (i = 0; i < (sz / PAGE_SIZE); i++)
-                               __flush_dcache_icache(page_address(page+i));
-                       set_bit(PG_arch_1, &page->flags);
-               } else {
-                       rflags |= HPTE_R_N;
-               }
-       }
-       return rflags;
-}
+       int shift = __ffs(size);
+       int mmu_psize;
 
-int hash_huge_page(struct mm_struct *mm, unsigned long access,
-                  unsigned long ea, unsigned long vsid, int local,
-                  unsigned long trap)
-{
-       pte_t *ptep;
-       unsigned long old_pte, new_pte;
-       unsigned long va, rflags, pa, sz;
-       long slot;
-       int err = 1;
-       int ssize = user_segment_size(ea);
-       unsigned int mmu_psize;
-       int shift;
-       mmu_psize = get_slice_psize(mm, ea);
-
-       if (!mmu_huge_psizes[mmu_psize])
-               goto out;
-       ptep = huge_pte_offset(mm, ea);
-
-       /* Search the Linux page table for a match with va */
-       va = hpt_va(ea, vsid, ssize);
+       /* Check that it is a page size supported by the hardware and
+        * that it fits within pagetable and slice limits. */
+       if (!is_power_of_2(size)
+           || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
+               return -EINVAL;
 
-       /*
-        * If no pte found or not present, send the problem up to
-        * do_page_fault
-        */
-       if (unlikely(!ptep || pte_none(*ptep)))
-               goto out;
+       if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
+               return -EINVAL;
 
-       /* 
-        * Check the user's access rights to the page.  If access should be
-        * prevented then send the problem up to do_page_fault.
-        */
-       if (unlikely(access & ~pte_val(*ptep)))
-               goto out;
-       /*
-        * At this point, we have a pte (old_pte) which can be used to build
-        * or update an HPTE. There are 2 cases:
-        *
-        * 1. There is a valid (present) pte with no associated HPTE (this is 
-        *      the most common case)
-        * 2. There is a valid (present) pte with an associated HPTE. The
-        *      current values of the pp bits in the HPTE prevent access
-        *      because we are doing software DIRTY bit management and the
-        *      page is currently not DIRTY. 
+#ifdef CONFIG_SPU_FS_64K_LS
+       /* Disable support for 64K huge pages when 64K SPU local store
+        * support is enabled as the current implementation conflicts.
         */
+       if (shift == PAGE_SHIFT_64K)
+               return -EINVAL;
+#endif /* CONFIG_SPU_FS_64K_LS */
 
+       BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
 
-       do {
-               old_pte = pte_val(*ptep);
-               if (old_pte & _PAGE_BUSY)
-                       goto out;
-               new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
-       } while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
-                                        old_pte, new_pte));
-
-       rflags = 0x2 | (!(new_pte & _PAGE_RW));
-       /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
-       rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
-       shift = mmu_psize_to_shift(mmu_psize);
-       sz = ((1UL) << shift);
-       if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
-               /* No CPU has hugepages but lacks no execute, so we
-                * don't need to worry about that case */
-               rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
-                                                      trap, sz);
-
-       /* Check if pte already has an hpte (case 2) */
-       if (unlikely(old_pte & _PAGE_HASHPTE)) {
-               /* There MIGHT be an HPTE for this pte */
-               unsigned long hash, slot;
-
-               hash = hpt_hash(va, shift, ssize);
-               if (old_pte & _PAGE_F_SECOND)
-                       hash = ~hash;
-               slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-               slot += (old_pte & _PAGE_F_GIX) >> 12;
-
-               if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
-                                        ssize, local) == -1)
-                       old_pte &= ~_PAGE_HPTEFLAGS;
-       }
-
-       if (likely(!(old_pte & _PAGE_HASHPTE))) {
-               unsigned long hash = hpt_hash(va, shift, ssize);
-               unsigned long hpte_group;
-
-               pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
-
-repeat:
-               hpte_group = ((hash & htab_hash_mask) *
-                             HPTES_PER_GROUP) & ~0x7UL;
-
-               /* clear HPTE slot informations in new PTE */
-#ifdef CONFIG_PPC_64K_PAGES
-               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
-#else
-               new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
-#endif
-               /* Add in WIMG bits */
-               rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
-                                     _PAGE_COHERENT | _PAGE_GUARDED));
-
-               /* Insert into the hash table, primary slot */
-               slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
-                                         mmu_psize, ssize);
-
-               /* Primary is full, try the secondary */
-               if (unlikely(slot == -1)) {
-                       hpte_group = ((~hash & htab_hash_mask) *
-                                     HPTES_PER_GROUP) & ~0x7UL; 
-                       slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
-                                                 HPTE_V_SECONDARY,
-                                                 mmu_psize, ssize);
-                       if (slot == -1) {
-                               if (mftb() & 0x1)
-                                       hpte_group = ((hash & htab_hash_mask) *
-                                                     HPTES_PER_GROUP)&~0x7UL;
-
-                               ppc_md.hpte_remove(hpte_group);
-                               goto repeat;
-                        }
-               }
-
-               if (unlikely(slot == -2))
-                       panic("hash_huge_page: pte_insert failed\n");
-
-               new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
-       }
-
-       /*
-        * No need to use ldarx/stdcx here
-        */
-       *ptep = __pte(new_pte & ~_PAGE_BUSY);
+       /* Return if huge page size has already been setup */
+       if (size_to_hstate(size))
+               return 0;
 
-       err = 0;
+       hugetlb_add_hstate(shift - PAGE_SHIFT);
 
- out:
-       return err;
-}
-
-static void __init set_huge_psize(int psize)
-{
-       /* Check that it is a page size supported by the hardware and
-        * that it fits within pagetable limits. */
-       if (mmu_psize_defs[psize].shift &&
-               mmu_psize_defs[psize].shift < SID_SHIFT_1T &&
-               (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT ||
-                mmu_psize_defs[psize].shift == PAGE_SHIFT_64K ||
-                mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) {
-               /* Return if huge page size has already been setup or is the
-                * same as the base page size. */
-               if (mmu_huge_psizes[psize] ||
-                  mmu_psize_defs[psize].shift == PAGE_SHIFT)
-                       return;
-               if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL))
-                       return;
-               hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT);
-
-               switch (mmu_psize_defs[psize].shift) {
-               case PAGE_SHIFT_64K:
-                   /* We only allow 64k hpages with 4k base page,
-                    * which was checked above, and always put them
-                    * at the PMD */
-                   hugepte_shift[psize] = PMD_SHIFT;
-                   break;
-               case PAGE_SHIFT_16M:
-                   /* 16M pages can be at two different levels
-                    * of pagestables based on base page size */
-                   if (PAGE_SHIFT == PAGE_SHIFT_64K)
-                           hugepte_shift[psize] = PMD_SHIFT;
-                   else /* 4k base page */
-                           hugepte_shift[psize] = PUD_SHIFT;
-                   break;
-               case PAGE_SHIFT_16G:
-                   /* 16G pages are always at PGD level */
-                   hugepte_shift[psize] = PGDIR_SHIFT;
-                   break;
-               }
-               hugepte_shift[psize] -= mmu_psize_defs[psize].shift;
-       } else
-               hugepte_shift[psize] = 0;
+       return 0;
 }
 
 static int __init hugepage_setup_sz(char *str)
 {
        unsigned long long size;
-       int mmu_psize;
-       int shift;
 
        size = memparse(str, &str);
 
-       shift = __ffs(size);
-       mmu_psize = shift_to_mmu_psize(shift);
-       if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift)
-               set_huge_psize(mmu_psize);
-       else
+       if (add_huge_page_size(size) != 0)
                printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size);
 
        return 1;
@@ -752,41 +517,55 @@ __setup("hugepagesz=", hugepage_setup_sz);
 
 static int __init hugetlbpage_init(void)
 {
-       unsigned int psize;
+       int psize;
 
        if (!cpu_has_feature(CPU_FTR_16M_PAGE))
                return -ENODEV;
 
-       /* Add supported huge page sizes.  Need to change HUGE_MAX_HSTATE
-        * and adjust PTE_NONCACHE_NUM if the number of supported huge page
-        * sizes changes.
-        */
-       set_huge_psize(MMU_PAGE_16M);
-       set_huge_psize(MMU_PAGE_16G);
+       for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+               unsigned shift;
+               unsigned pdshift;
 
-       /* Temporarily disable support for 64K huge pages when 64K SPU local
-        * store support is enabled as the current implementation conflicts.
-        */
-#ifndef CONFIG_SPU_FS_64K_LS
-       set_huge_psize(MMU_PAGE_64K);
-#endif
+               if (!mmu_psize_defs[psize].shift)
+                       continue;
 
-       for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
-               if (mmu_huge_psizes[psize]) {
-                       pgtable_cache[HUGE_PGTABLE_INDEX(psize)] =
-                               kmem_cache_create(
-                                       HUGEPTE_CACHE_NAME(psize),
-                                       HUGEPTE_TABLE_SIZE(psize),
-                                       HUGEPTE_TABLE_SIZE(psize),
-                                       0,
-                                       NULL);
-                       if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)])
-                               panic("hugetlbpage_init(): could not create %s"\
-                                     "\n", HUGEPTE_CACHE_NAME(psize));
-               }
+               shift = mmu_psize_to_shift(psize);
+
+               if (add_huge_page_size(1ULL << shift) < 0)
+                       continue;
+
+               if (shift < PMD_SHIFT)
+                       pdshift = PMD_SHIFT;
+               else if (shift < PUD_SHIFT)
+                       pdshift = PUD_SHIFT;
+               else
+                       pdshift = PGDIR_SHIFT;
+
+               pgtable_cache_add(pdshift - shift, NULL);
+               if (!PGT_CACHE(pdshift - shift))
+                       panic("hugetlbpage_init(): could not create "
+                             "pgtable cache for %d bit pagesize\n", shift);
        }
 
+       /* Set default large page size. Currently, we pick 16M or 1M
+        * depending on what is available
+        */
+       if (mmu_psize_defs[MMU_PAGE_16M].shift)
+               HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
+       else if (mmu_psize_defs[MMU_PAGE_1M].shift)
+               HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;
+
        return 0;
 }
 
 module_init(hugetlbpage_init);
+
+void flush_dcache_icache_hugepage(struct page *page)
+{
+       int i;
+
+       BUG_ON(!PageCompound(page));
+
+       for (i = 0; i < (1UL << compound_order(page)); i++)
+               __flush_dcache_icache(page_address(page+i));
+}
index 335c578b9cc324261a29a3a435036ef5178e9f96..776f28d02b6b0a163a6c5f8d6bde6532a350d38c 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/module.h>
 #include <linux/poison.h>
 #include <linux/lmb.h>
+#include <linux/hugetlb.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -119,30 +120,63 @@ static void pmd_ctor(void *addr)
        memset(addr, 0, PMD_TABLE_SIZE);
 }
 
-static const unsigned int pgtable_cache_size[2] = {
-       PGD_TABLE_SIZE, PMD_TABLE_SIZE
-};
-static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
-#ifdef CONFIG_PPC_64K_PAGES
-       "pgd_cache", "pmd_cache",
-#else
-       "pgd_cache", "pud_pmd_cache",
-#endif /* CONFIG_PPC_64K_PAGES */
-};
-
-#ifdef CONFIG_HUGETLB_PAGE
-/* Hugepages need an extra cache per hugepagesize, initialized in
- * hugetlbpage.c.  We can't put into the tables above, because HPAGE_SHIFT
- * is not compile time constant. */
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT];
-#else
-struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
-#endif
+struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE];
+
+/*
+ * Create a kmem_cache() for pagetables.  This is not used for PTE
+ * pages - they're linked to struct page, come from the normal free
+ * pages pool and have a different entry size (see real_pte_t) to
+ * everything else.  Caches created by this function are used for all
+ * the higher level pagetables, and for hugepage pagetables.
+ */
+void pgtable_cache_add(unsigned shift, void (*ctor)(void *))
+{
+       char *name;
+       unsigned long table_size = sizeof(void *) << shift;
+       unsigned long align = table_size;
+
+       /* When batching pgtable pointers for RCU freeing, we store
+        * the index size in the low bits.  Table alignment must be
+        * big enough to fit it.
+        *
+        * Likewise, hugepage pagetable pointers contain a (different)
+        * shift value in the low bits.  All tables must be aligned so
+        * as to leave enough 0 bits in the address to contain it. */
+       unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1,
+                                    HUGEPD_SHIFT_MASK + 1);
+       struct kmem_cache *new;
+
+       /* It would be nice if this was a BUILD_BUG_ON(), but at the
+        * moment, gcc doesn't seem to recognize is_power_of_2 as a
+        * constant expression, so much for that. */
+       BUG_ON(!is_power_of_2(minalign));
+       BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE));
+
+       if (PGT_CACHE(shift))
+               return; /* Already have a cache of this size */
+
+       align = max_t(unsigned long, align, minalign);
+       name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift);
+       new = kmem_cache_create(name, table_size, align, 0, ctor);
+       PGT_CACHE(shift) = new;
+
+       pr_debug("Allocated pgtable cache for order %d\n", shift);
+}
+
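
Every table handed out by these caches must leave enough zero low bits to carry either the RCU-freeing index size or a hugepd shift. A worked example, assuming MAX_PGTABLE_INDEX_SIZE == 0xf and HUGEPD_SHIFT_MASK == 0x3f (their assumed values in this series):

	/* minalign = max(0xf + 1, 0x3f + 1) = 64 bytes */

A shift-4 table is sizeof(void *) << 4 = 128 bytes, so its natural alignment already clears the low six bits; a hypothetical shift-2 table would be only 32 bytes and has its alignment raised to the 64-byte minimum.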
 
 void pgtable_cache_init(void)
 {
-       pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor);
-       pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor);
+       pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
+       pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor);
+       if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE))
+               panic("Couldn't allocate pgtable caches");
+
+       /* In all current configs, when the PUD index exists it's the
+        * same size as either the pgd or pmd index.  Verify that the
+        * initialization above has also created a PUD cache.  This
+        * will need re-examination if we add new possibilities for
+        * the pagetable layout. */
+       BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
 }
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
index 59736317bf0e87a1193910340dd338fb5e75c2ff..b9b152558f9c5de64d70399a37c1aa7dfd001706 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/pagemap.h>
 #include <linux/suspend.h>
 #include <linux/lmb.h>
+#include <linux/hugetlb.h>
 
 #include <asm/pgalloc.h>
 #include <asm/prom.h>
@@ -417,18 +418,26 @@ EXPORT_SYMBOL(flush_dcache_page);
 
 void flush_dcache_icache_page(struct page *page)
 {
+#ifdef CONFIG_HUGETLB_PAGE
+       if (PageCompound(page)) {
+               flush_dcache_icache_hugepage(page);
+               return;
+       }
+#endif
 #ifdef CONFIG_BOOKE
-       void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE);
-       __flush_dcache_icache(start);
-       kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
+       {
+               void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE);
+               __flush_dcache_icache(start);
+               kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
+       }
 #elif defined(CONFIG_8xx) || defined(CONFIG_PPC64)
        /* On 8xx there is no need to kmap since highmem is not supported */
        __flush_dcache_icache(page_address(page)); 
 #else
        __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT);
 #endif
-
 }
+
 void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
 {
        clear_page(page);
index dbeb86ac90cd79f01396eecbd72b98bc172253dc..b9e4cc2c2057e43bed991c06c4c9cc184bc6f3d2 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/mm.h>
 #include <linux/spinlock.h>
 #include <linux/idr.h>
+#include <linux/module.h>
 
 #include <asm/mmu_context.h>
 
@@ -32,7 +33,7 @@ static DEFINE_IDR(mmu_context_idr);
 #define NO_CONTEXT     0
 #define MAX_CONTEXT    ((1UL << 19) - 1)
 
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+int __init_new_context(void)
 {
        int index;
        int err;
@@ -57,6 +58,18 @@ again:
                return -ENOMEM;
        }
 
+       return index;
+}
+EXPORT_SYMBOL_GPL(__init_new_context);
+
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+       int index;
+
+       index = __init_new_context();
+       if (index < 0)
+               return index;
+
        /* The old code would re-promote on fork, we don't do that
         * when using slices as it could cause problem promoting slices
         * that have been forced down to 4K
@@ -68,11 +81,16 @@ again:
        return 0;
 }
 
-void destroy_context(struct mm_struct *mm)
+void __destroy_context(int context_id)
 {
        spin_lock(&mmu_context_lock);
-       idr_remove(&mmu_context_idr, mm->context.id);
+       idr_remove(&mmu_context_idr, context_id);
        spin_unlock(&mmu_context_lock);
+}
+EXPORT_SYMBOL_GPL(__destroy_context);
 
+void destroy_context(struct mm_struct *mm)
+{
+       __destroy_context(mm->context.id);
        mm->context.id = NO_CONTEXT;
 }
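
Splitting __init_new_context()/__destroy_context() out of the mm-bound wrappers, and exporting them GPL-only, lets kernel code that has no struct mm of its own hold a context id. A hypothetical modular user (for instance a KVM shadow MMU):

	int id = __init_new_context();
	if (id < 0)
		return id;	/* -ENOMEM when the IDR is exhausted */
	/* ... use id as a private MMU context ... */
	__destroy_context(id);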
index 53040931de322c5cf0df1955ad5aecd876b0f420..99df697c601acbd9a0fa3267cb313e7a21cd7303 100644 (file)
@@ -49,12 +49,12 @@ struct pte_freelist_batch
 {
        struct rcu_head rcu;
        unsigned int    index;
-       pgtable_free_t  tables[0];
+       unsigned long   tables[0];
 };
 
 #define PTE_FREELIST_SIZE \
        ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
-         / sizeof(pgtable_free_t))
+         / sizeof(unsigned long))
 
 static void pte_free_smp_sync(void *arg)
 {
@@ -64,13 +64,13 @@ static void pte_free_smp_sync(void *arg)
 /* This is only called when we are critically out of memory
  * (and fail to get a page in pte_free_tlb).
  */
-static void pgtable_free_now(pgtable_free_t pgf)
+static void pgtable_free_now(void *table, unsigned shift)
 {
        pte_freelist_forced_free++;
 
        smp_call_function(pte_free_smp_sync, NULL, 1);
 
-       pgtable_free(pgf);
+       pgtable_free(table, shift);
 }
 
 static void pte_free_rcu_callback(struct rcu_head *head)
@@ -79,8 +79,12 @@ static void pte_free_rcu_callback(struct rcu_head *head)
                container_of(head, struct pte_freelist_batch, rcu);
        unsigned int i;
 
-       for (i = 0; i < batch->index; i++)
-               pgtable_free(batch->tables[i]);
+       for (i = 0; i < batch->index; i++) {
+               void *table = (void *)(batch->tables[i] & ~MAX_PGTABLE_INDEX_SIZE);
+               unsigned shift = batch->tables[i] & MAX_PGTABLE_INDEX_SIZE;
+
+               pgtable_free(table, shift);
+       }
 
        free_page((unsigned long)batch);
 }
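
The freelist batch now stores bare unsigned longs: the table pointer with the cache index packed into its guaranteed-zero low bits. A sketch of the pack/unpack, assuming MAX_PGTABLE_INDEX_SIZE == 0xf and table alignment of at least 16 bytes (table and shift mirror the variables used in the surrounding functions):

	unsigned long pgf = (unsigned long)table | shift;	/* pack           */
	void *t = (void *)(pgf & ~(unsigned long)0xf);		/* unpack pointer */
	unsigned int s = (unsigned int)(pgf & 0xf);		/* unpack shift   */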
@@ -91,25 +95,28 @@ static void pte_free_submit(struct pte_freelist_batch *batch)
        call_rcu(&batch->rcu, pte_free_rcu_callback);
 }
 
-void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
+void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift)
 {
        /* This is safe since tlb_gather_mmu has disabled preemption */
        struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+       unsigned long pgf;
 
        if (atomic_read(&tlb->mm->mm_users) < 2 ||
            cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){
-               pgtable_free(pgf);
+               pgtable_free(table, shift);
                return;
        }
 
        if (*batchp == NULL) {
                *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
                if (*batchp == NULL) {
-                       pgtable_free_now(pgf);
+                       pgtable_free_now(table, shift);
                        return;
                }
                (*batchp)->index = 0;
        }
+       BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
+       pgf = (unsigned long)table | shift;
        (*batchp)->tables[(*batchp)->index++] = pgf;
        if ((*batchp)->index == PTE_FREELIST_SIZE) {
                pte_free_submit(*batchp);
index 2b2f35f6985e561aaacf2ef515e654d0ba1985b4..282d9306361f58f3ba3443648f73dd6955a75fd4 100644 (file)
@@ -53,11 +53,6 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
 
        i = batch->index;
 
-       /* We mask the address for the base page size. Huge pages will
-        * have applied their own masking already
-        */
-       addr &= PAGE_MASK;
-
        /* Get page size (maybe move back to caller).
         *
         * NOTE: when using special 64K mappings in 4K environment like
@@ -75,6 +70,9 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
        } else
                psize = pte_pagesize_index(mm, addr, pte);
 
+       /* Mask the address for the correct page size */
+       addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1);
+
        /* Build full vaddr */
        if (!is_kernel_addr(addr)) {
                ssize = user_segment_size(addr);
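
Rather than unconditionally applying PAGE_MASK before the page size is known, the flush path now masks the address with the real page size once psize has been determined. For example, with a 64K mapping:

	unsigned long addr = 0x12345678UL;
	unsigned int shift = 16;		/* 64K page */
	addr &= ~((1UL << shift) - 1);		/* 0x12340000, not 0x12345000 */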
index a6ce805666253d938fbfc986127d24ace67e3fda..cd70ee1667fae6ca178481a5f86a54b7cfc555ad 100644 (file)
@@ -132,7 +132,7 @@ static int
 cpld_pic_host_map(struct irq_host *h, unsigned int virq,
                             irq_hw_number_t hw)
 {
-       get_irq_desc(virq)->status |= IRQ_LEVEL;
+       irq_to_desc(virq)->status |= IRQ_LEVEL;
        set_irq_chip_and_handler(virq, &cpld_pic, handle_level_irq);
        return 0;
 }
index 68e4f1696d1488a663ef2480bae31fabd65aba32..47802035874838a14dbfa7952ea525125543c0ed 100644 (file)
@@ -114,7 +114,7 @@ void media5200_irq_cascade(unsigned int virq, struct irq_desc *desc)
 static int media5200_irq_map(struct irq_host *h, unsigned int virq,
                             irq_hw_number_t hw)
 {
-       struct irq_desc *desc = get_irq_desc(virq);
+       struct irq_desc *desc = irq_to_desc(virq);
 
        pr_debug("%s: h=%p, virq=%i, hwirq=%i\n", __func__, h, virq, (int)hw);
        set_irq_chip_data(virq, &media5200_irq);
index 7ee979f323d18c2da64e8597973bba0823ed1e8b..a682331ba0ff3d0a3e04db0054f68ad72c691e75 100644 (file)
@@ -107,7 +107,7 @@ static void pq2ads_pci_irq_demux(unsigned int irq, struct irq_desc *desc)
 static int pci_pic_host_map(struct irq_host *h, unsigned int virq,
                            irq_hw_number_t hw)
 {
-       get_irq_desc(virq)->status |= IRQ_LEVEL;
+       irq_to_desc(virq)->status |= IRQ_LEVEL;
        set_irq_chip_data(virq, h->host_data);
        set_irq_chip_and_handler(virq, &pq2ads_pci_ic, handle_level_irq);
        return 0;
index 60edf63d0157fbab56b6cdc473a15dfe01af3c9c..e59920aa66682594e3e9122d9eb5979e76b56faa 100644 (file)
@@ -245,7 +245,7 @@ static int socrates_fpga_pic_host_map(struct irq_host *h, unsigned int virq,
                irq_hw_number_t hwirq)
 {
        /* All interrupts are LEVEL sensitive */
-       get_irq_desc(virq)->status |= IRQ_LEVEL;
+       irq_to_desc(virq)->status |= IRQ_LEVEL;
        set_irq_chip_and_handler(virq, &socrates_fpga_pic_chip,
                        handle_fasteoi_irq);
 
index 50d0a2b6380954ac253707dd6c3cffc8d4465f4a..978d6cb37516ca6e9c5e14e0f36c45ba76a9c5d5 100644 (file)
@@ -163,7 +163,7 @@ static int gef_pic_host_map(struct irq_host *h, unsigned int virq,
                          irq_hw_number_t hwirq)
 {
        /* All interrupts are LEVEL sensitive */
-       get_irq_desc(virq)->status |= IRQ_LEVEL;
+       irq_to_desc(virq)->status |= IRQ_LEVEL;
        set_irq_chip_and_handler(virq, &gef_pic_chip, handle_level_irq);
 
        return 0;
index 385acfc48397b8f2d2577326b78b49bb68b100f1..242954c4293f64fcc6222a15de3eb6df6e8c0941 100644 (file)
@@ -222,7 +222,7 @@ static void cpm_cascade(unsigned int irq, struct irq_desc *desc)
        int cascade_irq;
 
        if ((cascade_irq = cpm_get_irq()) >= 0) {
-               struct irq_desc *cdesc = irq_desc + cascade_irq;
+               struct irq_desc *cdesc = irq_to_desc(cascade_irq);
 
                generic_handle_irq(cascade_irq);
                cdesc->chip->eoi(cascade_irq);
index 04a8061045c4feb3991945be7d619f849ea8320c..56bf12692f37285dff2e81f8d813bc558574a0e7 100644 (file)
@@ -86,6 +86,11 @@ config RTAS_ERROR_LOGGING
        depends on PPC_RTAS
        default n
 
+config PPC_RTAS_DAEMON
+       bool
+       depends on PPC_RTAS
+       default n
+
 config RTAS_PROC
        bool "Proc interface to RTAS"
        depends on PPC_RTAS
index 72254848a228e86923e19310915091a3cdce632b..4a2bbff5769813597ecd1ead6428089db6969171 100644 (file)
@@ -136,7 +136,7 @@ static void beatic_pic_host_unmap(struct irq_host *h, unsigned int virq)
 static int beatic_pic_host_map(struct irq_host *h, unsigned int virq,
                               irq_hw_number_t hw)
 {
-       struct irq_desc *desc = get_irq_desc(virq);
+       struct irq_desc *desc = irq_to_desc(virq);
        int64_t err;
 
        err = beat_construct_and_connect_irq_plug(virq, hw);
index 4e5655624ae877f1e0c8de8eaac716fba9db4f21..9dd63c5d11a8102b43a7bb865b6d38feb96c8038 100644 (file)
@@ -102,7 +102,7 @@ static void spider_ack_irq(unsigned int virq)
 
        /* Reset edge detection logic if necessary
         */
-       if (get_irq_desc(virq)->status & IRQ_LEVEL)
+       if (irq_to_desc(virq)->status & IRQ_LEVEL)
                return;
 
        /* Only interrupts 47 to 50 can be set to edge */
@@ -119,7 +119,7 @@ static int spider_set_irq_type(unsigned int virq, unsigned int type)
        struct spider_pic *pic = spider_virq_to_pic(virq);
        unsigned int hw = irq_map[virq].hwirq;
        void __iomem *cfg = spider_get_irq_config(pic, hw);
-       struct irq_desc *desc = get_irq_desc(virq);
+       struct irq_desc *desc = irq_to_desc(virq);
        u32 old_mask;
        u32 ic;
 
index 37d438bd5b7a10f29a9b40eafc8f2edf1cc5ffe1..bc0b0efdc5fe81c3969349d2814ebe3a665484b9 100644 (file)
@@ -5,6 +5,8 @@ config PPC_CHRP
        select PPC_I8259
        select PPC_INDIRECT_PCI
        select PPC_RTAS
+       select PPC_RTAS_DAEMON
+       select RTAS_ERROR_LOGGING
        select PPC_MPC106
        select PPC_UDBG_16550
        select PPC_NATIVE
index cd4ad9aea760d966505a977d9d92e5a7f1f33e38..52f3df3b4ca0a6a196e1712e6038a303661b7aed 100644 (file)
@@ -364,19 +364,6 @@ void __init chrp_setup_arch(void)
        if (ppc_md.progress) ppc_md.progress("Linux/PPC "UTS_RELEASE"\n", 0x0);
 }
 
-void
-chrp_event_scan(unsigned long unused)
-{
-       unsigned char log[1024];
-       int ret = 0;
-
-       /* XXX: we should loop until the hardware says no more error logs -- Cort */
-       rtas_call(rtas_token("event-scan"), 4, 1, &ret, 0xffffffff, 0,
-                 __pa(log), 1024);
-       mod_timer(&__get_cpu_var(heartbeat_timer),
-                 jiffies + event_scan_interval);
-}
-
 static void chrp_8259_cascade(unsigned int irq, struct irq_desc *desc)
 {
        unsigned int cascade_irq = i8259_irq();
@@ -568,9 +555,6 @@ void __init chrp_init_IRQ(void)
 void __init
 chrp_init2(void)
 {
-       struct device_node *device;
-       const unsigned int *p = NULL;
-
 #ifdef CONFIG_NVRAM
        chrp_nvram_init();
 #endif
@@ -582,40 +566,6 @@ chrp_init2(void)
        request_region(0x80,0x10,"dma page reg");
        request_region(0xc0,0x20,"dma2");
 
-       /* Get the event scan rate for the rtas so we know how
-        * often it expects a heartbeat. -- Cort
-        */
-       device = of_find_node_by_name(NULL, "rtas");
-       if (device)
-               p = of_get_property(device, "rtas-event-scan-rate", NULL);
-       if (p && *p) {
-               /*
-                * Arrange to call chrp_event_scan at least *p times
-                * per minute.  We use 59 rather than 60 here so that
-                * the rate will be slightly higher than the minimum.
-                * This all assumes we don't do hotplug CPU on any
-                * machine that needs the event scans done.
-                */
-               unsigned long interval, offset;
-               int cpu, ncpus;
-               struct timer_list *timer;
-
-               interval = HZ * 59 / *p;
-               offset = HZ;
-               ncpus = num_online_cpus();
-               event_scan_interval = ncpus * interval;
-               for (cpu = 0; cpu < ncpus; ++cpu) {
-                       timer = &per_cpu(heartbeat_timer, cpu);
-                       setup_timer(timer, chrp_event_scan, 0);
-                       timer->expires = jiffies + offset;
-                       add_timer_on(timer, cpu);
-                       offset += interval;
-               }
-               printk("RTAS Event Scan Rate: %u (%lu jiffies)\n",
-                      *p, interval);
-       }
-       of_node_put(device);
-
        if (ppc_md.progress)
                ppc_md.progress("  Have fun!    ", 0x7777);
 }
index 94f44475883685cdb6a6c2a940baac2c24c53622..f8446ea31189e4fc3896f99d124e56cbb0fd02d9 100644 (file)
@@ -214,7 +214,7 @@ void __init iSeries_activate_IRQs()
        unsigned long flags;
 
        for_each_irq (irq) {
-               struct irq_desc *desc = get_irq_desc(irq);
+               struct irq_desc *desc = irq_to_desc(irq);
 
                if (desc && desc->chip && desc->chip->startup) {
                        spin_lock_irqsave(&desc->lock, flags);
index d212006a5b3c0d4ca5213acdf1d02e0939add083..484d21e55c6195351d143aa3b1f3a63b8b014c26 100644 (file)
@@ -152,12 +152,12 @@ static unsigned int pmac_startup_irq(unsigned int virq)
         unsigned long bit = 1UL << (src & 0x1f);
         int i = src >> 5;
 
-       spin_lock_irqsave(&pmac_pic_lock, flags);
-       if ((irq_desc[virq].status & IRQ_LEVEL) == 0)
+       spin_lock_irqsave(&pmac_pic_lock, flags);
+       if ((irq_to_desc(virq)->status & IRQ_LEVEL) == 0)
                out_le32(&pmac_irq_hw[i]->ack, bit);
         __set_bit(src, ppc_cached_irq_mask);
         __pmac_set_irq_mask(src, 0);
-       spin_unlock_irqrestore(&pmac_pic_lock, flags);
+       spin_unlock_irqrestore(&pmac_pic_lock, flags);
 
        return 0;
 }
@@ -285,7 +285,7 @@ static int pmac_pic_host_match(struct irq_host *h, struct device_node *node)
 static int pmac_pic_host_map(struct irq_host *h, unsigned int virq,
                             irq_hw_number_t hw)
 {
-       struct irq_desc *desc = get_irq_desc(virq);
+       struct irq_desc *desc = irq_to_desc(virq);
        int level;
 
        if (hw >= max_irqs)
index 189a25b80735cb7c6731b3ce5d7ab9cd24ca605e..e81b028a2a48d8aca2efd0dcd7357ee9b4e9e62e 100644 (file)
@@ -34,7 +34,7 @@
 #if defined(DEBUG)
 #define DBG udbg_printf
 #else
-#define DBG pr_debug
+#define DBG pr_devel
 #endif
 
 enum {
index f0e6f28427bdb00bf30d0164bbc751ad964b4c3b..27554c807fd503ffea29fb966438ce2f6e229052 100644 (file)
@@ -4,6 +4,7 @@ config PPC_PSERIES
        select MPIC
        select PPC_I8259
        select PPC_RTAS
+       select PPC_RTAS_DAEMON
        select RTAS_ERROR_LOGGING
        select PPC_UDBG_16550
        select PPC_NATIVE
@@ -59,7 +60,7 @@ config PPC_SMLPAR
 
 config CMM
        tristate "Collaborative memory management"
-       depends on PPC_SMLPAR && !CRASH_DUMP
+       depends on PPC_SMLPAR
        default y
        help
          Select this option, if you want to enable the kernel interface
index 790c0b872d4fc617594ba826dcbeab9fb1d3d837..4b1c422b814529a845a5a5161a9074dff2669e00 100644 (file)
@@ -7,7 +7,7 @@ EXTRA_CFLAGS            += -DDEBUG
 endif
 
 obj-y                  := lpar.o hvCall.o nvram.o reconfig.o \
-                          setup.o iommu.o ras.o rtasd.o \
+                          setup.o iommu.o ras.o \
                           firmware.o power.o
 obj-$(CONFIG_SMP)      += smp.o
 obj-$(CONFIG_XICS)     += xics.o
index 6567439fe78dda4512e0524332255df66b5816a1..bcdcf0ccc8d799ffde04e138e1b6fa2736d968e2 100644 (file)
@@ -229,8 +229,9 @@ static void cmm_get_mpp(void)
 {
        int rc;
        struct hvcall_mpp_data mpp_data;
-       unsigned long active_pages_target;
-       signed long page_loan_request;
+       signed long active_pages_target, page_loan_request, target;
+       signed long total_pages = totalram_pages + loaned_pages;
+       signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE;
 
        rc = h_get_mpp(&mpp_data);
 
@@ -238,17 +239,25 @@ static void cmm_get_mpp(void)
                return;
 
        page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE);
-       loaned_pages_target = page_loan_request + loaned_pages;
-       if (loaned_pages_target > oom_freed_pages)
-               loaned_pages_target -= oom_freed_pages;
+       target = page_loan_request + (signed long)loaned_pages;
+
+       if (target < 0 || total_pages < min_mem_pages)
+               target = 0;
+
+       if (target > oom_freed_pages)
+               target -= oom_freed_pages;
        else
-               loaned_pages_target = 0;
+               target = 0;
+
+       active_pages_target = total_pages - target;
+
+       if (min_mem_pages > active_pages_target)
+               target = total_pages - min_mem_pages;
 
-       active_pages_target = totalram_pages + loaned_pages - loaned_pages_target;
+       if (target < 0)
+               target = 0;
 
-       if ((min_mem_mb * 1024 * 1024) > (active_pages_target * PAGE_SIZE))
-               loaned_pages_target = totalram_pages + loaned_pages -
-                       ((min_mem_mb * 1024 * 1024) / PAGE_SIZE);
+       loaned_pages_target = target;
 
        cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n",
                page_loan_request, loaned_pages, loaned_pages_target,
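
The reworked cmm_get_mpp() computes the loan target in a signed local and clamps it so the partition neither shrinks below min_mem_mb nor ends up with a negative target. A worked example with illustrative numbers:

	/* totalram_pages = 1000, loaned_pages = 100  -> total_pages = 1100
	 * hypervisor loan_request = +200 pages       -> target = 300
	 * oom_freed_pages = 50                       -> target = 250
	 * active target = 1100 - 250 = 850 < min_mem_pages = 900,
	 * so target = 1100 - 900 = 200, i.e. loaned_pages_target = 200 */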
index 0e8db6771252a2f3dcb4c28891e0f6d2958389a0..ef8e45448480836bba0097606e6dca9c92afb6a3 100644 (file)
@@ -63,22 +63,6 @@ static void print_device_node_tree(struct pci_dn *pdn, int dent)
 }
 #endif
 
-/** 
- * irq_in_use - return true if this irq is being used 
- */
-static int irq_in_use(unsigned int irq)
-{
-       int rc = 0;
-       unsigned long flags;
-   struct irq_desc *desc = irq_desc + irq;
-
-       spin_lock_irqsave(&desc->lock, flags);
-       if (desc->action)
-               rc = 1;
-       spin_unlock_irqrestore(&desc->lock, flags);
-       return rc;
-}
-
 /**
  * eeh_disable_irq - disable interrupt for the recovering device
  */
@@ -93,7 +77,7 @@ static void eeh_disable_irq(struct pci_dev *dev)
        if (dev->msi_enabled || dev->msix_enabled)
                return;
 
-       if (!irq_in_use(dev->irq))
+       if (!irq_has_action(dev->irq))
                return;
 
        PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED;
index 2e2bbe120b908ec2816067214006f969f6504492..5182d2b992c6c7b16507a94733e46608cface63f 100644 (file)
@@ -184,7 +184,7 @@ static int pSeries_reconfig_remove_node(struct device_node *np)
 }
 
 /*
- * /proc/ppc64/ofdt - yucky binary interface for adding and removing
+ * /proc/powerpc/ofdt - yucky binary interface for adding and removing
  * OF device nodes.  Should be deprecated as soon as we get an
  * in-kernel wrapper for the RTAS ibm,configure-connector call.
  */
@@ -543,7 +543,7 @@ static const struct file_operations ofdt_fops = {
        .write = ofdt_write
 };
 
-/* create /proc/ppc64/ofdt write-only by root */
+/* create /proc/powerpc/ofdt write-only by root */
 static int proc_ppc64_create_ofdt(void)
 {
        struct proc_dir_entry *ent;
@@ -551,7 +551,7 @@ static int proc_ppc64_create_ofdt(void)
        if (!machine_is(pseries))
                return 0;
 
-       ent = proc_create("ppc64/ofdt", S_IWUSR, NULL, &ofdt_fops);
+       ent = proc_create("powerpc/ofdt", S_IWUSR, NULL, &ofdt_fops);
        if (ent)
                ent->size = 0;
 
diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c
deleted file mode 100644 (file)
index b3cbac8..0000000
+++ /dev/null
@@ -1,519 +0,0 @@
-/*
- * Copyright (C) 2001 Anton Blanchard <anton@au.ibm.com>, IBM
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Communication to userspace based on kernel/printk.c
- */
-
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/poll.h>
-#include <linux/proc_fs.h>
-#include <linux/init.h>
-#include <linux/vmalloc.h>
-#include <linux/spinlock.h>
-#include <linux/cpu.h>
-#include <linux/workqueue.h>
-
-#include <asm/uaccess.h>
-#include <asm/io.h>
-#include <asm/rtas.h>
-#include <asm/prom.h>
-#include <asm/nvram.h>
-#include <asm/atomic.h>
-#include <asm/machdep.h>
-
-
-static DEFINE_SPINLOCK(rtasd_log_lock);
-
-static DECLARE_WAIT_QUEUE_HEAD(rtas_log_wait);
-
-static char *rtas_log_buf;
-static unsigned long rtas_log_start;
-static unsigned long rtas_log_size;
-
-static int surveillance_timeout = -1;
-static unsigned int rtas_error_log_max;
-static unsigned int rtas_error_log_buffer_max;
-
-/* RTAS service tokens */
-static unsigned int event_scan;
-static unsigned int rtas_event_scan_rate;
-
-static int full_rtas_msgs = 0;
-
-/* Stop logging to nvram after first fatal error */
-static int logging_enabled; /* Until we initialize everything,
-                             * make sure we don't try logging
-                             * anything */
-static int error_log_cnt;
-
-/*
- * Since we use 32 bit RTAS, the physical address of this must be below
- * 4G or else bad things happen. Allocate this in the kernel data and
- * make it big enough.
- */
-static unsigned char logdata[RTAS_ERROR_LOG_MAX];
-
-static char *rtas_type[] = {
-       "Unknown", "Retry", "TCE Error", "Internal Device Failure",
-       "Timeout", "Data Parity", "Address Parity", "Cache Parity",
-       "Address Invalid", "ECC Uncorrected", "ECC Corrupted",
-};
-
-static char *rtas_event_type(int type)
-{
-       if ((type > 0) && (type < 11))
-               return rtas_type[type];
-
-       switch (type) {
-               case RTAS_TYPE_EPOW:
-                       return "EPOW";
-               case RTAS_TYPE_PLATFORM:
-                       return "Platform Error";
-               case RTAS_TYPE_IO:
-                       return "I/O Event";
-               case RTAS_TYPE_INFO:
-                       return "Platform Information Event";
-               case RTAS_TYPE_DEALLOC:
-                       return "Resource Deallocation Event";
-               case RTAS_TYPE_DUMP:
-                       return "Dump Notification Event";
-       }
-
-       return rtas_type[0];
-}
-
-/* To see this info, grep for RTAS in /var/log/messages; each entry
- * is collected together between obvious begin/end lines.
- * The begin and end lines carry a unique identifier, which
- * persists across reboots.
- *
- * format of error logs returned from RTAS:
- * bytes       (size)  : contents
- * --------------------------------------------------------
- * 0-7         (8)     : rtas_error_log
- * 8-47                (40)    : extended info
- * 48-51       (4)     : vendor id
- * 52-1023 (vendor specific) : location code and debug data
- */
-static void printk_log_rtas(char *buf, int len)
-{
-       int i, j, n = 0;
-       int perline = 16;
-       char buffer[64];
-       char *str = "RTAS event";
-
-       if (full_rtas_msgs) {
-               printk(RTAS_DEBUG "%d -------- %s begin --------\n",
-                      error_log_cnt, str);
-
-               /*
-                * Print perline bytes on each line, each line will start
-                * with RTAS and a changing number, so syslogd will
-                * print lines that are otherwise the same.  Separate every
-                * 4 bytes with a space.
-                */
-               for (i = 0; i < len; i++) {
-                       j = i % perline;
-                       if (j == 0) {
-                               memset(buffer, 0, sizeof(buffer));
-                               n = sprintf(buffer, "RTAS %d:", i/perline);
-                       }
-
-                       if ((i % 4) == 0)
-                               n += sprintf(buffer+n, " ");
-
-                       n += sprintf(buffer+n, "%02x", (unsigned char)buf[i]);
-
-                       if (j == (perline-1))
-                               printk(KERN_DEBUG "%s\n", buffer);
-               }
-               if ((i % perline) != 0)
-                       printk(KERN_DEBUG "%s\n", buffer);
-
-               printk(RTAS_DEBUG "%d -------- %s end ----------\n",
-                      error_log_cnt, str);
-       } else {
-               struct rtas_error_log *errlog = (struct rtas_error_log *)buf;
-
-               printk(RTAS_DEBUG "event: %d, Type: %s, Severity: %d\n",
-                      error_log_cnt, rtas_event_type(errlog->type),
-                      errlog->severity);
-       }
-}
-
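
The byte-offset table in the comment above printk_log_rtas() maps onto a layout along these lines; the sketch below is illustrative only, and the authoritative definition is struct rtas_error_log in arch/powerpc/include/asm/rtas.h:

    /* Hedged sketch of the RTAS event layout described above. */
    struct rtas_event_layout {
            unsigned char fixed_header[8];   /* bytes 0-7:   rtas_error_log */
            unsigned char extended_info[40]; /* bytes 8-47:  extended info */
            unsigned char vendor_id[4];      /* bytes 48-51: vendor id */
            unsigned char vendor_data[];     /* bytes 52+:   location code and
                                                debug data (vendor specific) */
    };
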
-static int log_rtas_len(char *buf)
-{
-       int len;
-       struct rtas_error_log *err;
-
-       /* rtas fixed header */
-       len = 8;
-       err = (struct rtas_error_log *)buf;
-       if (err->extended_log_length) {
-
-               /* extended header */
-               len += err->extended_log_length;
-       }
-
-       if (rtas_error_log_max == 0)
-               rtas_error_log_max = rtas_get_error_log_max();
-
-       if (len > rtas_error_log_max)
-               len = rtas_error_log_max;
-
-       return len;
-}
-
-/*
- * First write to nvram; if the error is fatal, that is the only
- * place we log the info.  The error will be picked up
- * on the next reboot by rtasd.  If not fatal, run the
- * method for the type of error.  Currently, only RTAS
- * errors have methods implemented, but in the future
- * there might be a need to store data in nvram before a
- * call to panic().
- *
- * XXX We write to nvram periodically, to indicate that the error has
- * been written and sync'd, but there is a possibility
- * that if we don't shut down correctly, a duplicate error
- * record will be created on the next reboot.
- */
-void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
-{
-       unsigned long offset;
-       unsigned long s;
-       int len = 0;
-
-       pr_debug("rtasd: logging event\n");
-       if (buf == NULL)
-               return;
-
-       spin_lock_irqsave(&rtasd_log_lock, s);
-
-       /* get length and increase count */
-       switch (err_type & ERR_TYPE_MASK) {
-       case ERR_TYPE_RTAS_LOG:
-               len = log_rtas_len(buf);
-               if (!(err_type & ERR_FLAG_BOOT))
-                       error_log_cnt++;
-               break;
-       case ERR_TYPE_KERNEL_PANIC:
-       default:
-               WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
-               spin_unlock_irqrestore(&rtasd_log_lock, s);
-               return;
-       }
-
-       /* Write error to NVRAM */
-       if (logging_enabled && !(err_type & ERR_FLAG_BOOT))
-               nvram_write_error_log(buf, len, err_type, error_log_cnt);
-
-       /*
-        * rtas errors can occur during boot, and we do want to capture
-        * those somewhere, even if nvram isn't ready (why not?), and even
-        * if rtasd isn't ready. Put them into the boot log, at least.
-        */
-       if ((err_type & ERR_TYPE_MASK) == ERR_TYPE_RTAS_LOG)
-               printk_log_rtas(buf, len);
-
-       /* Check to see if we need to or have stopped logging */
-       if (fatal || !logging_enabled) {
-               logging_enabled = 0;
-               WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
-               spin_unlock_irqrestore(&rtasd_log_lock, s);
-               return;
-       }
-
-       /* call type specific method for error */
-       switch (err_type & ERR_TYPE_MASK) {
-       case ERR_TYPE_RTAS_LOG:
-               offset = rtas_error_log_buffer_max *
-                       ((rtas_log_start+rtas_log_size) & LOG_NUMBER_MASK);
-
-               /* First copy over sequence number */
-               memcpy(&rtas_log_buf[offset], (void *) &error_log_cnt, sizeof(int));
-
-               /* Second copy over error log data */
-               offset += sizeof(int);
-               memcpy(&rtas_log_buf[offset], buf, len);
-
-               if (rtas_log_size < LOG_NUMBER)
-                       rtas_log_size += 1;
-               else
-                       rtas_log_start += 1;
-
-               WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
-               spin_unlock_irqrestore(&rtasd_log_lock, s);
-               wake_up_interruptible(&rtas_log_wait);
-               break;
-       case ERR_TYPE_KERNEL_PANIC:
-       default:
-               WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
-               spin_unlock_irqrestore(&rtasd_log_lock, s);
-               return;
-       }
-}
-
-static int rtas_log_open(struct inode *inode, struct file *file)
-{
-       return 0;
-}
-
-static int rtas_log_release(struct inode *inode, struct file *file)
-{
-       return 0;
-}
-
-/* This will check if all events are logged; if they are, then we
- * know that we can safely clear the events in NVRAM.
- * Next we'll sit and wait for something else to log.
- */
-static ssize_t rtas_log_read(struct file *file, char __user *buf,
-                            size_t count, loff_t *ppos)
-{
-       int error;
-       char *tmp;
-       unsigned long s;
-       unsigned long offset;
-
-       if (!buf || count < rtas_error_log_buffer_max)
-               return -EINVAL;
-
-       count = rtas_error_log_buffer_max;
-
-       if (!access_ok(VERIFY_WRITE, buf, count))
-               return -EFAULT;
-
-       tmp = kmalloc(count, GFP_KERNEL);
-       if (!tmp)
-               return -ENOMEM;
-
-       spin_lock_irqsave(&rtasd_log_lock, s);
-       /* if it's 0, then we know we got the last one (the one in NVRAM) */
-       while (rtas_log_size == 0) {
-               if (file->f_flags & O_NONBLOCK) {
-                       spin_unlock_irqrestore(&rtasd_log_lock, s);
-                       error = -EAGAIN;
-                       goto out;
-               }
-
-               if (!logging_enabled) {
-                       spin_unlock_irqrestore(&rtasd_log_lock, s);
-                       error = -ENODATA;
-                       goto out;
-               }
-               nvram_clear_error_log();
-
-               spin_unlock_irqrestore(&rtasd_log_lock, s);
-               error = wait_event_interruptible(rtas_log_wait, rtas_log_size);
-               if (error)
-                       goto out;
-               spin_lock_irqsave(&rtasd_log_lock, s);
-       }
-
-       offset = rtas_error_log_buffer_max * (rtas_log_start & LOG_NUMBER_MASK);
-       memcpy(tmp, &rtas_log_buf[offset], count);
-
-       rtas_log_start += 1;
-       rtas_log_size -= 1;
-       spin_unlock_irqrestore(&rtasd_log_lock, s);
-
-       error = copy_to_user(buf, tmp, count) ? -EFAULT : count;
-out:
-       kfree(tmp);
-       return error;
-}
-
-static unsigned int rtas_log_poll(struct file *file, poll_table *wait)
-{
-       poll_wait(file, &rtas_log_wait, wait);
-       if (rtas_log_size)
-               return POLLIN | POLLRDNORM;
-       return 0;
-}
-
-static const struct file_operations proc_rtas_log_operations = {
-       .read =         rtas_log_read,
-       .poll =         rtas_log_poll,
-       .open =         rtas_log_open,
-       .release =      rtas_log_release,
-};
-
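
These file operations imply a simple consumer loop: block in poll() until rtas_log_poll() reports POLLIN, then read exactly one record. A hedged userspace sketch (the buffer size is an assumption; rtas_log_read() rejects reads shorter than rtas_error_log_buffer_max with -EINVAL):

    #include <fcntl.h>
    #include <poll.h>
    #include <unistd.h>

    int main(void)
    {
            char record[4096];      /* assumed >= rtas_error_log_buffer_max */
            int fd = open("/proc/ppc64/rtas/error_log", O_RDONLY);
            struct pollfd pfd = { .fd = fd, .events = POLLIN };

            if (fd < 0)
                    return 1;
            while (poll(&pfd, 1, -1) > 0)   /* woken via rtas_log_wait */
                    if (read(fd, record, sizeof(record)) <= 0)
                            break;          /* one queued event per read */
            close(fd);
            return 0;
    }
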
-static int enable_surveillance(int timeout)
-{
-       int error;
-
-       error = rtas_set_indicator(SURVEILLANCE_TOKEN, 0, timeout);
-
-       if (error == 0)
-               return 0;
-
-       if (error == -EINVAL) {
-               printk(KERN_DEBUG "rtasd: surveillance not supported\n");
-               return 0;
-       }
-
-       printk(KERN_ERR "rtasd: could not update surveillance\n");
-       return -1;
-}
-
-static void do_event_scan(void)
-{
-       int error;
-       do {
-               memset(logdata, 0, rtas_error_log_max);
-               error = rtas_call(event_scan, 4, 1, NULL,
-                                 RTAS_EVENT_SCAN_ALL_EVENTS, 0,
-                                 __pa(logdata), rtas_error_log_max);
-               if (error == -1) {
-                       printk(KERN_ERR "event-scan failed\n");
-                       break;
-               }
-
-               if (error == 0)
-                       pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0);
-
-       } while (error == 0);
-}
-
-static void rtas_event_scan(struct work_struct *w);
-DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan);
-
-/*
- * Delay should be at least one second since some machines have problems if
- * we call event-scan too quickly.
- */
-static unsigned long event_scan_delay = 1*HZ;
-static int first_pass = 1;
-
-static void rtas_event_scan(struct work_struct *w)
-{
-       unsigned int cpu;
-
-       do_event_scan();
-
-       get_online_cpus();
-
-       cpu = next_cpu(smp_processor_id(), cpu_online_map);
-       if (cpu == NR_CPUS) {
-               cpu = first_cpu(cpu_online_map);
-
-               if (first_pass) {
-                       first_pass = 0;
-                       event_scan_delay = 30*HZ/rtas_event_scan_rate;
-
-                       if (surveillance_timeout != -1) {
-                               pr_debug("rtasd: enabling surveillance\n");
-                               enable_surveillance(surveillance_timeout);
-                               pr_debug("rtasd: surveillance enabled\n");
-                       }
-               }
-       }
-
-       schedule_delayed_work_on(cpu, &event_scan_work,
-               __round_jiffies_relative(event_scan_delay, cpu));
-
-       put_online_cpus();
-}
-
-static void start_event_scan(void)
-{
-       unsigned int err_type;
-       int rc;
-
-       printk(KERN_DEBUG "RTAS daemon started\n");
-       pr_debug("rtasd: will sleep for %d milliseconds\n",
-                (30000 / rtas_event_scan_rate));
-
-       /* See if we have any error stored in NVRAM */
-       memset(logdata, 0, rtas_error_log_max);
-       rc = nvram_read_error_log(logdata, rtas_error_log_max,
-                                 &err_type, &error_log_cnt);
-       /* We can use rtas_log_buf now */
-       logging_enabled = 1;
-
-       if (!rc) {
-               if (err_type != ERR_FLAG_ALREADY_LOGGED) {
-                       pSeries_log_error(logdata, err_type | ERR_FLAG_BOOT, 0);
-               }
-       }
-
-       schedule_delayed_work_on(first_cpu(cpu_online_map), &event_scan_work,
-                                event_scan_delay);
-}
-
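
To put numbers on the scan interval: the rtas-event-scan-rate property is nominally the number of event-scan calls firmware expects per minute, so with a rate of 2 the pr_debug above reports 30000/2 = 15000 milliseconds, and after the first pass event_scan_delay becomes 30*HZ/2, i.e. one scan every 15 seconds, rotated across the online CPUs by rtas_event_scan().
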
-static int __init rtas_init(void)
-{
-       struct proc_dir_entry *entry;
-
-       if (!machine_is(pseries))
-               return 0;
-
-       /* No RTAS */
-       event_scan = rtas_token("event-scan");
-       if (event_scan == RTAS_UNKNOWN_SERVICE) {
-               printk(KERN_DEBUG "rtasd: no event-scan on system\n");
-               return -ENODEV;
-       }
-
-       rtas_event_scan_rate = rtas_token("rtas-event-scan-rate");
-       if (rtas_event_scan_rate == RTAS_UNKNOWN_SERVICE) {
-               printk(KERN_ERR "rtasd: no rtas-event-scan-rate on system\n");
-               return -ENODEV;
-       }
-
-       /* Make room for the sequence number */
-       rtas_error_log_max = rtas_get_error_log_max();
-       rtas_error_log_buffer_max = rtas_error_log_max + sizeof(int);
-
-       rtas_log_buf = vmalloc(rtas_error_log_buffer_max*LOG_NUMBER);
-       if (!rtas_log_buf) {
-               printk(KERN_ERR "rtasd: no memory\n");
-               return -ENOMEM;
-       }
-
-       entry = proc_create("ppc64/rtas/error_log", S_IRUSR, NULL,
-                           &proc_rtas_log_operations);
-       if (!entry)
-               printk(KERN_ERR "Failed to create error_log proc entry\n");
-
-       start_event_scan();
-
-       return 0;
-}
-
-static int __init surveillance_setup(char *str)
-{
-       int i;
-
-       if (get_option(&str, &i)) {
-               if (i >= 0 && i <= 255)
-                       surveillance_timeout = i;
-       }
-
-       return 1;
-}
-
-static int __init rtasmsgs_setup(char *str)
-{
-       if (strcmp(str, "on") == 0)
-               full_rtas_msgs = 1;
-       else if (strcmp(str, "off") == 0)
-               full_rtas_msgs = 0;
-
-       return 1;
-}
-__initcall(rtas_init);
-__setup("surveillance=", surveillance_setup);
-__setup("rtasmsgs=", rtasmsgs_setup);
index 417eca79df697ed35eb9b29d41ef15cd1a1891b2..1b45c458f952fc6da9cb0b9e3ebb63416023ae06 100644 (file)
@@ -13,7 +13,7 @@
  * of this data using this driver.  A dump exists if the device-tree
  * /chosen/ibm,scan-log-data property exists.
  *
- * This driver exports /proc/ppc64/scan-log-dump which can be read.
+ * This driver exports /proc/powerpc/scan-log-dump which can be read.
  * The driver supports only sequential reads.
  *
 * The driver watches writes for the single word "reset".
@@ -186,7 +186,7 @@ static int __init scanlog_init(void)
        if (!data)
                goto err;
 
-       ent = proc_create_data("ppc64/rtas/scan-log-dump", S_IRUSR, NULL,
+       ent = proc_create_data("powerpc/rtas/scan-log-dump", S_IRUSR, NULL,
                               &scanlog_fops, data);
        if (!ent)
                goto err;
index b9bf0eedccf27a18eefd3c640d44cc37f16b1708..097e8a2a3c5d7cbd2981baf3a6cad4d6eea80ce3 100644 (file)
@@ -157,7 +157,7 @@ static int get_irq_server(unsigned int virq, unsigned int strict_check)
        cpumask_t cpumask;
        cpumask_t tmp = CPU_MASK_NONE;
 
-       cpumask_copy(&cpumask, irq_desc[virq].affinity);
+       cpumask_copy(&cpumask, irq_to_desc(virq)->affinity);
        if (!distribute_irqs)
                return default_server;
 
@@ -428,7 +428,7 @@ static int xics_host_map(struct irq_host *h, unsigned int virq,
        /* Insert the interrupt mapping into the radix tree for fast lookup */
        irq_radix_revmap_insert(xics_host, virq, hw);
 
-       get_irq_desc(virq)->status |= IRQ_LEVEL;
+       irq_to_desc(virq)->status |= IRQ_LEVEL;
        set_irq_chip_and_handler(virq, xics_irq_chip, handle_fasteoi_irq);
        return 0;
 }
@@ -852,7 +852,7 @@ void xics_migrate_irqs_away(void)
                /* We need to get IPIs still. */
                if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
                        continue;
-               desc = get_irq_desc(virq);
+               desc = irq_to_desc(virq);
 
                /* We only need to migrate enabled IRQS */
                if (desc == NULL || desc->chip == NULL
@@ -881,7 +881,7 @@ void xics_migrate_irqs_away(void)
                       virq, cpu);
 
                /* Reset affinity to all cpus */
-               cpumask_setall(irq_desc[virq].affinity);
+               cpumask_setall(irq_to_desc(virq)->affinity);
                desc->chip->set_affinity(virq, cpu_all_mask);
 unlock:
                spin_unlock_irqrestore(&desc->lock, flags);
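
The xics hunks above belong to a tree-wide conversion from indexing the irq_desc[] array directly to the irq_to_desc() accessor, which keeps working once the descriptor table is allocated dynamically under sparse-IRQ configurations. A before/after sketch (the virq and the status update are illustrative):

    #include <linux/irq.h>

    static void example_mark_level(unsigned int virq)
    {
            /* old: direct indexing assumes a static irq_desc[] array */
            /* irq_desc[virq].status |= IRQ_LEVEL; */

            /* new: fetch the descriptor through the accessor */
            irq_to_desc(virq)->status |= IRQ_LEVEL;
    }
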
index 82424cd7e1287bcc009b81f2327b078229a6050f..523537300ad5dcef02959c7b5629dda3be0ea04c 100644 (file)
@@ -102,7 +102,7 @@ static int cpm_pic_host_map(struct irq_host *h, unsigned int virq,
 {
        pr_debug("cpm_pic_host_map(%d, 0x%lx)\n", virq, hw);
 
-       get_irq_desc(virq)->status |= IRQ_LEVEL;
+       irq_to_desc(virq)->status |= IRQ_LEVEL;
        set_irq_chip_and_handler(virq, &cpm_pic, handle_fasteoi_irq);
        return 0;
 }
index 78f1f7cca0a017ba6ee17d83065ba1544f0dbb71..722cf72e190deb5976ceb8f4749ac0fe595dc993 100644 (file)
@@ -115,11 +115,13 @@ static void cpm2_ack(unsigned int virq)
 
 static void cpm2_end_irq(unsigned int virq)
 {
+       struct irq_desc *desc;
        int     bit, word;
        unsigned int irq_nr = virq_to_hw(virq);
 
-       if (!(irq_desc[irq_nr].status & (IRQ_DISABLED|IRQ_INPROGRESS))
-                       && irq_desc[irq_nr].action) {
+       desc = irq_to_desc(irq_nr);
+       if (!(desc->status & (IRQ_DISABLED|IRQ_INPROGRESS))
+                       && desc->action) {
 
                bit = irq_to_siubit[irq_nr];
                word = irq_to_siureg[irq_nr];
@@ -138,7 +140,7 @@ static void cpm2_end_irq(unsigned int virq)
 static int cpm2_set_irq_type(unsigned int virq, unsigned int flow_type)
 {
        unsigned int src = virq_to_hw(virq);
-       struct irq_desc *desc = get_irq_desc(virq);
+       struct irq_desc *desc = irq_to_desc(virq);
        unsigned int vold, vnew, edibit;
 
        if (flow_type == IRQ_TYPE_NONE)
@@ -210,7 +212,7 @@ static int cpm2_pic_host_map(struct irq_host *h, unsigned int virq,
 {
        pr_debug("cpm2_pic_host_map(%d, 0x%lx)\n", virq, hw);
 
-       get_irq_desc(virq)->status |= IRQ_LEVEL;
+       irq_to_desc(virq)->status |= IRQ_LEVEL;
        set_irq_chip_and_handler(virq, &cpm2_pic, handle_level_irq);
        return 0;
 }
index da38a1ff97bb47b0b58955191313dbec4b233ea5..7174374f90ff41513ba45393d65de540d72050f5 100644 (file)
@@ -55,7 +55,7 @@ static int fsl_msi_host_map(struct irq_host *h, unsigned int virq,
 {
        struct irq_chip *chip = &fsl_msi_chip;
 
-       get_irq_desc(virq)->status |= IRQ_TYPE_EDGE_FALLING;
+       irq_to_desc(virq)->status |= IRQ_TYPE_EDGE_FALLING;
 
        set_irq_chip_and_handler(virq, chip, handle_edge_irq);
 
index a96584ab33dda7e9e8140bf1db9bae897364545d..78ed945453db5cf5d198d660c6883b12d50d7b9f 100644 (file)
@@ -175,12 +175,12 @@ static int i8259_host_map(struct irq_host *h, unsigned int virq,
 
        /* We block the internal cascade */
        if (hw == 2)
-               get_irq_desc(virq)->status |= IRQ_NOREQUEST;
+               irq_to_desc(virq)->status |= IRQ_NOREQUEST;
 
        /* We use the level handler only for now; we might want to
         * be more cautious here, but that works for now.
         */
-       get_irq_desc(virq)->status |= IRQ_LEVEL;
+       irq_to_desc(virq)->status |= IRQ_LEVEL;
        set_irq_chip_and_handler(virq, &i8259_pic, handle_level_irq);
        return 0;
 }
index cb7689c4bfbd5e7f3578a78961e3a0293f20de8b..f042c1d6900295d50f1bdad17ecc252145b6f5d7 100644 (file)
@@ -605,7 +605,7 @@ static int ipic_set_irq_type(unsigned int virq, unsigned int flow_type)
 {
        struct ipic *ipic = ipic_from_irq(virq);
        unsigned int src = ipic_irq_to_hw(virq);
-       struct irq_desc *desc = get_irq_desc(virq);
+       struct irq_desc *desc = irq_to_desc(virq);
        unsigned int vold, vnew, edibit;
 
        if (flow_type == IRQ_TYPE_NONE)
index 5d2d5522ef416846a0ba9bf8073c05652227d4c7..01179587df2ac86cf2661617a03c34ad75f16da2 100644 (file)
@@ -72,7 +72,7 @@ static void mpc8xx_end_irq(unsigned int virq)
 
 static int mpc8xx_set_irq_type(unsigned int virq, unsigned int flow_type)
 {
-       struct irq_desc *desc = get_irq_desc(virq);
+       struct irq_desc *desc = irq_to_desc(virq);
 
        desc->status &= ~(IRQ_TYPE_SENSE_MASK | IRQ_LEVEL);
        desc->status |= flow_type & IRQ_TYPE_SENSE_MASK;
index 30c44e6b0413e5981f2500fab9c4ad1c8a3a493a..4fd57ab956bf6975b2279df3ae1820ada1a16f8a 100644 (file)
@@ -572,7 +572,7 @@ static int irq_choose_cpu(unsigned int virt_irq)
        cpumask_t mask;
        int cpuid;
 
-       cpumask_copy(&mask, irq_desc[virt_irq].affinity);
+       cpumask_copy(&mask, irq_to_desc(virt_irq)->affinity);
        if (cpus_equal(mask, CPU_MASK_ALL)) {
                static int irq_rover;
                static DEFINE_SPINLOCK(irq_rover_lock);
@@ -621,7 +621,7 @@ static struct mpic *mpic_find(unsigned int irq)
        if (irq < NUM_ISA_INTERRUPTS)
                return NULL;
 
-       return irq_desc[irq].chip_data;
+       return irq_to_desc(irq)->chip_data;
 }
 
 /* Determine if the linux irq is an IPI */
@@ -648,14 +648,14 @@ static inline u32 mpic_physmask(u32 cpumask)
 /* Get the mpic structure from the IPI number */
 static inline struct mpic *mpic_from_ipi(unsigned int ipi)
 {
-       return irq_desc[ipi].chip_data;
+       return irq_to_desc(ipi)->chip_data;
 }
 #endif
 
 /* Get the mpic structure from the irq number */
 static inline struct mpic *mpic_from_irq(unsigned int irq)
 {
-       return irq_desc[irq].chip_data;
+       return irq_to_desc(irq)->chip_data;
 }
 
 /* Send an EOI */
@@ -735,7 +735,7 @@ static void mpic_unmask_ht_irq(unsigned int irq)
 
        mpic_unmask_irq(irq);
 
-       if (irq_desc[irq].status & IRQ_LEVEL)
+       if (irq_to_desc(irq)->status & IRQ_LEVEL)
                mpic_ht_end_irq(mpic, src);
 }
 
@@ -745,7 +745,7 @@ static unsigned int mpic_startup_ht_irq(unsigned int irq)
        unsigned int src = mpic_irq_to_hw(irq);
 
        mpic_unmask_irq(irq);
-       mpic_startup_ht_interrupt(mpic, src, irq_desc[irq].status);
+       mpic_startup_ht_interrupt(mpic, src, irq_to_desc(irq)->status);
 
        return 0;
 }
@@ -755,7 +755,7 @@ static void mpic_shutdown_ht_irq(unsigned int irq)
        struct mpic *mpic = mpic_from_irq(irq);
        unsigned int src = mpic_irq_to_hw(irq);
 
-       mpic_shutdown_ht_interrupt(mpic, src, irq_desc[irq].status);
+       mpic_shutdown_ht_interrupt(mpic, src, irq_to_desc(irq)->status);
        mpic_mask_irq(irq);
 }
 
@@ -772,7 +772,7 @@ static void mpic_end_ht_irq(unsigned int irq)
         * latched another edge interrupt coming in anyway
         */
 
-       if (irq_desc[irq].status & IRQ_LEVEL)
+       if (irq_to_desc(irq)->status & IRQ_LEVEL)
                mpic_ht_end_irq(mpic, src);
        mpic_eoi(mpic);
 }
@@ -856,7 +856,7 @@ int mpic_set_irq_type(unsigned int virq, unsigned int flow_type)
 {
        struct mpic *mpic = mpic_from_irq(virq);
        unsigned int src = mpic_irq_to_hw(virq);
-       struct irq_desc *desc = get_irq_desc(virq);
+       struct irq_desc *desc = irq_to_desc(virq);
        unsigned int vecpri, vold, vnew;
 
        DBG("mpic: set_irq_type(mpic:@%p,virq:%d,src:0x%x,type:0x%x)\n",
index 2aa4ed066db11bcec9bcfa7127b3b7c50eb9b8d9..485b92477d7c19de26fb09403fd1e7d6a5f99a6c 100644 (file)
@@ -213,7 +213,7 @@ static int mv64x60_host_map(struct irq_host *h, unsigned int virq,
 {
        int level1;
 
-       get_irq_desc(virq)->status |= IRQ_LEVEL;
+       irq_to_desc(virq)->status |= IRQ_LEVEL;
 
        level1 = (hwirq & MV64x60_LEVEL1_MASK) >> MV64x60_LEVEL1_OFFSET;
        BUG_ON(level1 > MV64x60_LEVEL1_GPP);
index 3faa42e03a85af05a379efaa2f103093b6d5fb9a..fc098744ad822e4fccd606c29c19eea2dcaf0fa8 100644 (file)
@@ -189,7 +189,7 @@ static inline void qe_ic_write(volatile __be32  __iomem * base, unsigned int reg
 
 static inline struct qe_ic *qe_ic_from_irq(unsigned int virq)
 {
-       return irq_desc[virq].chip_data;
+       return irq_to_desc(virq)->chip_data;
 }
 
 #define virq_to_hw(virq)       ((unsigned int)irq_map[virq].hwirq)
@@ -263,7 +263,7 @@ static int qe_ic_host_map(struct irq_host *h, unsigned int virq,
        chip = &qe_ic->hc_irq;
 
        set_irq_chip_data(virq, qe_ic);
-       get_irq_desc(virq)->status |= IRQ_LEVEL;
+       irq_to_desc(virq)->status |= IRQ_LEVEL;
 
        set_irq_chip_and_handler(virq, chip, handle_level_irq);
 
index cf244a419e9650cd631189a26b59455a64ef5f3b..02f600991dcefd0875765d60eacb23ae54716c8a 100644 (file)
@@ -398,7 +398,7 @@ static int pci_irq_host_map(struct irq_host *h, unsigned int virq,
        DBG("%s(%d, 0x%lx)\n", __func__, virq, hw);
        if ((virq >= 1) && (virq <= 4)) {
                irq = virq + IRQ_PCI_INTAD_BASE - 1;
-               get_irq_desc(irq)->status |= IRQ_LEVEL;
+               irq_to_desc(irq)->status |= IRQ_LEVEL;
                set_irq_chip(irq, &tsi108_pci_irq);
        }
        return 0;
index 466ce9ace1270ec881a4ae2f18e92a70648af40b..cf97935863c869b289854857d6b5e3e7198b4ba6 100644 (file)
@@ -57,7 +57,7 @@ struct uic {
 
 static void uic_unmask_irq(unsigned int virq)
 {
-       struct irq_desc *desc = get_irq_desc(virq);
+       struct irq_desc *desc = irq_to_desc(virq);
        struct uic *uic = get_irq_chip_data(virq);
        unsigned int src = uic_irq_to_hw(virq);
        unsigned long flags;
@@ -101,7 +101,7 @@ static void uic_ack_irq(unsigned int virq)
 
 static void uic_mask_ack_irq(unsigned int virq)
 {
-       struct irq_desc *desc = get_irq_desc(virq);
+       struct irq_desc *desc = irq_to_desc(virq);
        struct uic *uic = get_irq_chip_data(virq);
        unsigned int src = uic_irq_to_hw(virq);
        unsigned long flags;
@@ -129,7 +129,7 @@ static int uic_set_irq_type(unsigned int virq, unsigned int flow_type)
 {
        struct uic *uic = get_irq_chip_data(virq);
        unsigned int src = uic_irq_to_hw(virq);
-       struct irq_desc *desc = get_irq_desc(virq);
+       struct irq_desc *desc = irq_to_desc(virq);
        unsigned long flags;
        int trigger, polarity;
        u32 tr, pr, mask;
index 40edad520770e335c1afc7a7af0463f159972ef4..ab743718876b59ecd5bfba5378502092219007c3 100644 (file)
@@ -79,7 +79,7 @@ static void xilinx_intc_mask(unsigned int virq)
 
 static int xilinx_intc_set_type(unsigned int virq, unsigned int flow_type)
 {
-       struct irq_desc *desc = get_irq_desc(virq);
+       struct irq_desc *desc = irq_to_desc(virq);
 
        desc->status &= ~(IRQ_TYPE_SENSE_MASK | IRQ_LEVEL);
        desc->status |= flow_type & IRQ_TYPE_SENSE_MASK;
index 556f0feaa4df33fe3733e0ccf3638bb6bfe1aa44..95b676a19be72c791235a5d3c9594a0911f1fa22 100644 (file)
@@ -387,7 +387,7 @@ static int probe_thermostat(struct i2c_client *client,
        i2c_set_clientdata(client, th);
        th->clt = client;
 
-       rc = read_reg(th, 0);
+       rc = read_reg(th, CONFIG_REG);
        if (rc < 0) {
                dev_err(&client->dev, "Thermostat failed to read config!\n");
                kfree(th);
index 298de0f95d70d24d15727ab7b54a28d438854ec2..d58ade170c4b27fd50bf5abc471eac618c4eea3e 100644 (file)
@@ -65,47 +65,322 @@ static int of_platform_device_remove(struct device *dev)
        return 0;
 }
 
-static int of_platform_device_suspend(struct device *dev, pm_message_t state)
+static void of_platform_device_shutdown(struct device *dev)
 {
        struct of_device *of_dev = to_of_device(dev);
        struct of_platform_driver *drv = to_of_platform_driver(dev->driver);
-       int error = 0;
 
-       if (dev->driver && drv->suspend)
-               error = drv->suspend(of_dev, state);
-       return error;
+       if (dev->driver && drv->shutdown)
+               drv->shutdown(of_dev);
 }
 
-static int of_platform_device_resume(struct device * dev)
+#ifdef CONFIG_PM_SLEEP
+
+static int of_platform_legacy_suspend(struct device *dev, pm_message_t mesg)
 {
        struct of_device *of_dev = to_of_device(dev);
        struct of_platform_driver *drv = to_of_platform_driver(dev->driver);
-       int error = 0;
+       int ret = 0;
 
-       if (dev->driver && drv->resume)
-               error = drv->resume(of_dev);
-       return error;
+       if (dev->driver && drv->suspend)
+               ret = drv->suspend(of_dev, mesg);
+       return ret;
 }
 
-static void of_platform_device_shutdown(struct device *dev)
+static int of_platform_legacy_resume(struct device *dev)
 {
        struct of_device *of_dev = to_of_device(dev);
        struct of_platform_driver *drv = to_of_platform_driver(dev->driver);
+       int ret = 0;
 
-       if (dev->driver && drv->shutdown)
-               drv->shutdown(of_dev);
+       if (dev->driver && drv->resume)
+               ret = drv->resume(of_dev);
+       return ret;
+}
+
+static int of_platform_pm_prepare(struct device *dev)
+{
+       struct device_driver *drv = dev->driver;
+       int ret = 0;
+
+       if (drv && drv->pm && drv->pm->prepare)
+               ret = drv->pm->prepare(dev);
+
+       return ret;
+}
+
+static void of_platform_pm_complete(struct device *dev)
+{
+       struct device_driver *drv = dev->driver;
+
+       if (drv && drv->pm && drv->pm->complete)
+               drv->pm->complete(dev);
+}
+
+#ifdef CONFIG_SUSPEND
+
+static int of_platform_pm_suspend(struct device *dev)
+{
+       struct device_driver *drv = dev->driver;
+       int ret = 0;
+
+       if (!drv)
+               return 0;
+
+       if (drv->pm) {
+               if (drv->pm->suspend)
+                       ret = drv->pm->suspend(dev);
+       } else {
+               ret = of_platform_legacy_suspend(dev, PMSG_SUSPEND);
+       }
+
+       return ret;
 }
 
+static int of_platform_pm_suspend_noirq(struct device *dev)
+{
+       struct device_driver *drv = dev->driver;
+       int ret = 0;
+
+       if (!drv)
+               return 0;
+
+       if (drv->pm) {
+               if (drv->pm->suspend_noirq)
+                       ret = drv->pm->suspend_noirq(dev);
+       }
+
+       return ret;
+}
+
+static int of_platform_pm_resume(struct device *dev)
+{
+       struct device_driver *drv = dev->driver;
+       int ret = 0;
+
+       if (!drv)
+               return 0;
+
+       if (drv->pm) {
+               if (drv->pm->resume)
+                       ret = drv->pm->resume(dev);
+       } else {
+               ret = of_platform_legacy_resume(dev);
+       }
+
+       return ret;
+}
+
+static int of_platform_pm_resume_noirq(struct device *dev)
+{
+       struct device_driver *drv = dev->driver;
+       int ret = 0;
+
+       if (!drv)
+               return 0;
+
+       if (drv->pm) {
+               if (drv->pm->resume_noirq)
+                       ret = drv->pm->resume_noirq(dev);
+       }
+
+       return ret;
+}
+
+#else /* !CONFIG_SUSPEND */
+
+#define of_platform_pm_suspend         NULL
+#define of_platform_pm_resume          NULL
+#define of_platform_pm_suspend_noirq   NULL
+#define of_platform_pm_resume_noirq    NULL
+
+#endif /* !CONFIG_SUSPEND */
+
+#ifdef CONFIG_HIBERNATION
+
+static int of_platform_pm_freeze(struct device *dev)
+{
+       struct device_driver *drv = dev->driver;
+       int ret = 0;
+
+       if (!drv)
+               return 0;
+
+       if (drv->pm) {
+               if (drv->pm->freeze)
+                       ret = drv->pm->freeze(dev);
+       } else {
+               ret = of_platform_legacy_suspend(dev, PMSG_FREEZE);
+       }
+
+       return ret;
+}
+
+static int of_platform_pm_freeze_noirq(struct device *dev)
+{
+       struct device_driver *drv = dev->driver;
+       int ret = 0;
+
+       if (!drv)
+               return 0;
+
+       if (drv->pm) {
+               if (drv->pm->freeze_noirq)
+                       ret = drv->pm->freeze_noirq(dev);
+       }
+
+       return ret;
+}
+
+static int of_platform_pm_thaw(struct device *dev)
+{
+       struct device_driver *drv = dev->driver;
+       int ret = 0;
+
+       if (!drv)
+               return 0;
+
+       if (drv->pm) {
+               if (drv->pm->thaw)
+                       ret = drv->pm->thaw(dev);
+       } else {
+               ret = of_platform_legacy_resume(dev);
+       }
+
+       return ret;
+}
+
+static int of_platform_pm_thaw_noirq(struct device *dev)
+{
+       struct device_driver *drv = dev->driver;
+       int ret = 0;
+
+       if (!drv)
+               return 0;
+
+       if (drv->pm) {
+               if (drv->pm->thaw_noirq)
+                       ret = drv->pm->thaw_noirq(dev);
+       }
+
+       return ret;
+}
+
+static int of_platform_pm_poweroff(struct device *dev)
+{
+       struct device_driver *drv = dev->driver;
+       int ret = 0;
+
+       if (!drv)
+               return 0;
+
+       if (drv->pm) {
+               if (drv->pm->poweroff)
+                       ret = drv->pm->poweroff(dev);
+       } else {
+               ret = of_platform_legacy_suspend(dev, PMSG_HIBERNATE);
+       }
+
+       return ret;
+}
+
+static int of_platform_pm_poweroff_noirq(struct device *dev)
+{
+       struct device_driver *drv = dev->driver;
+       int ret = 0;
+
+       if (!drv)
+               return 0;
+
+       if (drv->pm) {
+               if (drv->pm->poweroff_noirq)
+                       ret = drv->pm->poweroff_noirq(dev);
+       }
+
+       return ret;
+}
+
+static int of_platform_pm_restore(struct device *dev)
+{
+       struct device_driver *drv = dev->driver;
+       int ret = 0;
+
+       if (!drv)
+               return 0;
+
+       if (drv->pm) {
+               if (drv->pm->restore)
+                       ret = drv->pm->restore(dev);
+       } else {
+               ret = of_platform_legacy_resume(dev);
+       }
+
+       return ret;
+}
+
+static int of_platform_pm_restore_noirq(struct device *dev)
+{
+       struct device_driver *drv = dev->driver;
+       int ret = 0;
+
+       if (!drv)
+               return 0;
+
+       if (drv->pm) {
+               if (drv->pm->restore_noirq)
+                       ret = drv->pm->restore_noirq(dev);
+       }
+
+       return ret;
+}
+
+#else /* !CONFIG_HIBERNATION */
+
+#define of_platform_pm_freeze            NULL
+#define of_platform_pm_thaw              NULL
+#define of_platform_pm_poweroff          NULL
+#define of_platform_pm_restore           NULL
+#define of_platform_pm_freeze_noirq      NULL
+#define of_platform_pm_thaw_noirq        NULL
+#define of_platform_pm_poweroff_noirq    NULL
+#define of_platform_pm_restore_noirq     NULL
+
+#endif /* !CONFIG_HIBERNATION */
+
+static struct dev_pm_ops of_platform_dev_pm_ops = {
+       .prepare = of_platform_pm_prepare,
+       .complete = of_platform_pm_complete,
+       .suspend = of_platform_pm_suspend,
+       .resume = of_platform_pm_resume,
+       .freeze = of_platform_pm_freeze,
+       .thaw = of_platform_pm_thaw,
+       .poweroff = of_platform_pm_poweroff,
+       .restore = of_platform_pm_restore,
+       .suspend_noirq = of_platform_pm_suspend_noirq,
+       .resume_noirq = of_platform_pm_resume_noirq,
+       .freeze_noirq = of_platform_pm_freeze_noirq,
+       .thaw_noirq = of_platform_pm_thaw_noirq,
+       .poweroff_noirq = of_platform_pm_poweroff_noirq,
+       .restore_noirq = of_platform_pm_restore_noirq,
+};
+
+#define OF_PLATFORM_PM_OPS_PTR (&of_platform_dev_pm_ops)
+
+#else /* !CONFIG_PM_SLEEP */
+
+#define OF_PLATFORM_PM_OPS_PTR NULL
+
+#endif /* !CONFIG_PM_SLEEP */
+
 int of_bus_type_init(struct bus_type *bus, const char *name)
 {
        bus->name = name;
        bus->match = of_platform_bus_match;
        bus->probe = of_platform_device_probe;
        bus->remove = of_platform_device_remove;
-       bus->suspend = of_platform_device_suspend;
-       bus->resume = of_platform_device_resume;
        bus->shutdown = of_platform_device_shutdown;
        bus->dev_attrs = of_platform_device_attrs;
+       bus->pm = OF_PLATFORM_PM_OPS_PTR;
        return bus_register(bus);
 }
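
The net effect of this rewrite: the bus drops its legacy suspend/resume hooks and instead publishes a dev_pm_ops table whose callbacks dispatch to a driver's own drv->pm methods when present, falling back to the old of_platform_driver suspend/resume through the *_legacy_* wrappers otherwise. A hedged sketch of a driver opting into the new interface (all names illustrative):

    #include <linux/pm.h>

    static int foo_suspend(struct device *dev)
    {
            /* quiesce the hardware */
            return 0;
    }

    static int foo_resume(struct device *dev)
    {
            /* bring the hardware back up */
            return 0;
    }

    static const struct dev_pm_ops foo_pm_ops = {
            .suspend = foo_suspend,
            .resume  = foo_resume,
    };

With .driver.pm = &foo_pm_ops set in its of_platform_driver, the driver is called through of_platform_pm_suspend()/of_platform_pm_resume() and never hits the legacy suspend(of_dev, mesg) path.
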
 
index 7ba79a54948cf049ad277a8c9d2fad0735ace088..123257bb356b38bc8fe614b101a2354a24c82e47 100644 (file)
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/time.h>
 #include <linux/proc_fs.h>
+#include <linux/seq_file.h>
 #include <linux/stat.h>
 #include <linux/string.h>
 #include <asm/prom.h>
@@ -25,26 +26,27 @@ static struct proc_dir_entry *proc_device_tree;
 /*
  * Supply data on a read from /proc/device-tree/node/property.
  */
-static int property_read_proc(char *page, char **start, off_t off,
-                             int count, int *eof, void *data)
+static int property_proc_show(struct seq_file *m, void *v)
 {
-       struct property *pp = data;
-       int n;
+       struct property *pp = m->private;
 
-       if (off >= pp->length) {
-               *eof = 1;
-               return 0;
-       }
-       n = pp->length - off;
-       if (n > count)
-               n = count;
-       else
-               *eof = 1;
-       memcpy(page, (char *)pp->value + off, n);
-       *start = page;
-       return n;
+       seq_write(m, pp->value, pp->length);
+       return 0;
+}
+
+static int property_proc_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, property_proc_show, PDE(inode)->data);
 }
 
+static const struct file_operations property_proc_fops = {
+       .owner          = THIS_MODULE,
+       .open           = property_proc_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
 /*
  * For a node with a name like "gc@10", we make symlinks called "gc"
  * and "@10" to it.
@@ -63,10 +65,9 @@ __proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp,
         * Unfortunately proc_register puts each new entry
         * at the beginning of the list.  So we rearrange them.
         */
-       ent = create_proc_read_entry(name,
-                                    strncmp(name, "security-", 9)
-                                    ? S_IRUGO : S_IRUSR, de,
-                                    property_read_proc, pp);
+       ent = proc_create_data(name,
+                              strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR,
+                              de, &property_proc_fops, pp);
        if (ent == NULL)
                return NULL;
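
The conversion above trades the old read_proc callback, with its error-prone page/off/count/eof bookkeeping, for the seq_file single-shot pattern: single_open() arranges for the show routine to emit everything once, and seq_read()/seq_lseek() handle offsets. A minimal sketch of the same pattern for an arbitrary read-only proc entry (names illustrative; registration via proc_create_data() omitted):

    #include <linux/module.h>
    #include <linux/proc_fs.h>
    #include <linux/seq_file.h>

    static int example_show(struct seq_file *m, void *v)
    {
            seq_printf(m, "hello\n");       /* emit the whole file in one pass */
            return 0;
    }

    static int example_open(struct inode *inode, struct file *file)
    {
            return single_open(file, example_show, NULL);
    }

    static const struct file_operations example_fops = {
            .owner   = THIS_MODULE,
            .open    = example_open,
            .read    = seq_read,
            .llseek  = seq_lseek,
            .release = single_release,
    };
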
 
index 7495ce3473448cd45ee1dba9e0ae441c83245ca4..cdca63917e772cdb56a58046ac1d6f2a2f5d9bcd 100644 (file)
@@ -48,6 +48,7 @@
 #include <asm/io.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
+#include <asm-generic/bitops/le.h>
 
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
 #include "coalesced_mmio.h"
@@ -1665,8 +1666,8 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
                unsigned long rel_gfn = gfn - memslot->base_gfn;
 
                /* avoid RMW */
-               if (!test_bit(rel_gfn, memslot->dirty_bitmap))
-                       set_bit(rel_gfn, memslot->dirty_bitmap);
+               if (!generic_test_le_bit(rel_gfn, memslot->dirty_bitmap))
+                       generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
        }
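
The switch to the little-endian bit helpers pins down the dirty-bitmap layout regardless of host word size and byte order: with the native long-based test_bit/set_bit, a 64-bit big-endian kernel and a 32-bit userspace disagree about which byte of the bitmap holds bit 0. A hedged sketch of the idiom (the helper name is illustrative):

    #include <asm-generic/bitops/le.h>

    static void mark_dirty_sketch(unsigned long *bitmap, unsigned long nr)
    {
            /* test first to avoid a read-modify-write on already-dirty pages */
            if (!generic_test_le_bit(nr, bitmap))
                    generic___set_le_bit(nr, bitmap);
    }
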
 }