git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/commitdiff
Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/scottwood/linux...
author    Michael Ellerman <mpe@ellerman.id.au>
          Fri, 16 Dec 2016 04:05:38 +0000 (15:05 +1100)
committer Michael Ellerman <mpe@ellerman.id.au>
          Fri, 16 Dec 2016 04:05:38 +0000 (15:05 +1100)
Freescale updates from Scott:

"Highlights include 8xx hugepage support, qbman fixes/cleanup, device
tree updates, and some misc cleanup."

134 files changed:
arch/powerpc/Kconfig
arch/powerpc/Kconfig.debug
arch/powerpc/Makefile
arch/powerpc/boot/Makefile
arch/powerpc/boot/ps3-head.S
arch/powerpc/boot/ps3.c
arch/powerpc/boot/wrapper
arch/powerpc/configs/powernv_defconfig
arch/powerpc/configs/ppc64_defconfig
arch/powerpc/configs/pseries_defconfig
arch/powerpc/include/asm/asm-prototypes.h
arch/powerpc/include/asm/book3s/32/pgtable.h
arch/powerpc/include/asm/book3s/64/hash-4k.h
arch/powerpc/include/asm/book3s/64/hash-64k.h
arch/powerpc/include/asm/book3s/64/hugetlb-radix.h [deleted file]
arch/powerpc/include/asm/book3s/64/hugetlb.h [new file with mode: 0644]
arch/powerpc/include/asm/book3s/64/mmu-hash.h
arch/powerpc/include/asm/book3s/64/pgtable.h
arch/powerpc/include/asm/book3s/64/radix.h
arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
arch/powerpc/include/asm/cmpxchg.h
arch/powerpc/include/asm/debug.h
arch/powerpc/include/asm/hugetlb.h
arch/powerpc/include/asm/hvcall.h
arch/powerpc/include/asm/kexec.h
arch/powerpc/include/asm/kprobes.h
arch/powerpc/include/asm/machdep.h
arch/powerpc/include/asm/mmu.h
arch/powerpc/include/asm/mmu_context.h
arch/powerpc/include/asm/nohash/32/pgtable.h
arch/powerpc/include/asm/nohash/64/pgtable-4k.h
arch/powerpc/include/asm/nohash/64/pgtable-64k.h
arch/powerpc/include/asm/nohash/64/pgtable.h
arch/powerpc/include/asm/opal.h
arch/powerpc/include/asm/plpar_wrappers.h
arch/powerpc/include/asm/ppc-pci.h
arch/powerpc/include/asm/prom.h
arch/powerpc/include/asm/reg.h
arch/powerpc/include/asm/smp.h
arch/powerpc/include/asm/stackprotector.h [new file with mode: 0644]
arch/powerpc/include/asm/systbl.h
arch/powerpc/include/asm/unistd.h
arch/powerpc/include/uapi/asm/unistd.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/asm-offsets.c
arch/powerpc/kernel/cpu_setup_power.S
arch/powerpc/kernel/eeh.c
arch/powerpc/kernel/eeh_driver.c
arch/powerpc/kernel/eeh_event.c
arch/powerpc/kernel/eeh_pe.c
arch/powerpc/kernel/entry_32.S
arch/powerpc/kernel/exceptions-64e.S
arch/powerpc/kernel/ftrace.c
arch/powerpc/kernel/head_64.S
arch/powerpc/kernel/ibmebus.c [deleted file]
arch/powerpc/kernel/kexec_elf_64.c [new file with mode: 0644]
arch/powerpc/kernel/kprobes.c
arch/powerpc/kernel/machine_kexec_64.c
arch/powerpc/kernel/machine_kexec_file_64.c [new file with mode: 0644]
arch/powerpc/kernel/mce.c
arch/powerpc/kernel/misc_32.S
arch/powerpc/kernel/misc_64.S
arch/powerpc/kernel/of_platform.c
arch/powerpc/kernel/process.c
arch/powerpc/kernel/prom.c
arch/powerpc/kernel/prom_init.c
arch/powerpc/kernel/setup-common.c
arch/powerpc/kernel/setup_64.c
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/traps.c
arch/powerpc/kernel/vio.c [deleted file]
arch/powerpc/lib/sstep.c
arch/powerpc/mm/Makefile
arch/powerpc/mm/copro_fault.c
arch/powerpc/mm/dump_linuxpagetables.c
arch/powerpc/mm/fault.c
arch/powerpc/mm/hash_native_64.c
arch/powerpc/mm/hash_utils_64.c
arch/powerpc/mm/mmu_context_book3s64.c
arch/powerpc/mm/mmu_context_iommu.c
arch/powerpc/mm/pgtable-book3s64.c
arch/powerpc/mm/pgtable-radix.c
arch/powerpc/mm/pgtable.c
arch/powerpc/mm/pgtable_64.c
arch/powerpc/mm/tlb-radix.c
arch/powerpc/perf/isa207-common.c
arch/powerpc/perf/isa207-common.h
arch/powerpc/perf/power8-pmu.c
arch/powerpc/perf/power9-pmu.c
arch/powerpc/platforms/40x/Kconfig
arch/powerpc/platforms/44x/Kconfig
arch/powerpc/platforms/85xx/corenet_generic.c
arch/powerpc/platforms/85xx/smp.c
arch/powerpc/platforms/Kconfig
arch/powerpc/platforms/cell/Kconfig
arch/powerpc/platforms/cell/spu_base.c
arch/powerpc/platforms/powernv/eeh-powernv.c
arch/powerpc/platforms/powernv/npu-dma.c
arch/powerpc/platforms/powernv/opal-wrappers.S
arch/powerpc/platforms/powernv/opal.c
arch/powerpc/platforms/powernv/pci-ioda.c
arch/powerpc/platforms/powernv/pci.c
arch/powerpc/platforms/powernv/setup.c
arch/powerpc/platforms/ps3/htab.c
arch/powerpc/platforms/ps3/setup.c
arch/powerpc/platforms/pseries/Kconfig
arch/powerpc/platforms/pseries/Makefile
arch/powerpc/platforms/pseries/dlpar.c
arch/powerpc/platforms/pseries/eeh_pseries.c
arch/powerpc/platforms/pseries/hotplug-memory.c
arch/powerpc/platforms/pseries/ibmebus.c [new file with mode: 0644]
arch/powerpc/platforms/pseries/lpar.c
arch/powerpc/platforms/pseries/setup.c
arch/powerpc/platforms/pseries/vio.c [new file with mode: 0644]
arch/powerpc/purgatory/.gitignore [new file with mode: 0644]
arch/powerpc/purgatory/Makefile [new file with mode: 0644]
arch/powerpc/purgatory/trampoline.S [new file with mode: 0644]
arch/powerpc/xmon/xmon.c
arch/x86/kernel/crash.c
arch/x86/kernel/kexec-bzimage64.c
drivers/misc/cxl/api.c
drivers/misc/cxl/context.c
drivers/misc/cxl/cxl.h
drivers/misc/cxl/debugfs.c
drivers/misc/cxl/file.c
drivers/misc/cxl/guest.c
drivers/misc/cxl/irq.c
drivers/misc/cxl/native.c
drivers/misc/cxl/pci.c
drivers/misc/cxl/phb.c
drivers/vfio/vfio_iommu_spapr_tce.c
include/linux/kexec.h
kernel/kexec_file.c
kernel/kexec_internal.h

index b49062b060d2027d1af658fff0f81e0a8649c697..48001e754ff2687bf379d755a078850266fd79b3 100644 (file)
@@ -163,6 +163,7 @@ config PPC
        select HAVE_VIRT_CPU_ACCOUNTING
        select HAVE_ARCH_HARDENED_USERCOPY
        select HAVE_KERNEL_GZIP
+       select HAVE_CC_STACKPROTECTOR
 
 config GENERIC_CSUM
        def_bool CPU_LITTLE_ENDIAN
@@ -396,6 +397,14 @@ config MPROFILE_KERNEL
        depends on PPC64 && CPU_LITTLE_ENDIAN
        def_bool !DISABLE_MPROFILE_KERNEL
 
+config USE_THIN_ARCHIVES
+       bool "Build the kernel using thin archives"
+       default n
+       select THIN_ARCHIVES
+       help
+         Build the kernel using thin archives.
+         If you're unsure say N.
+
 config IOMMU_HELPER
        def_bool PPC64
 
@@ -456,6 +465,19 @@ config KEXEC
          interface is strongly in flux, so no good recommendation can be
          made.
 
+config KEXEC_FILE
+       bool "kexec file based system call"
+       select KEXEC_CORE
+       select BUILD_BIN2C
+       depends on PPC64
+       depends on CRYPTO=y
+       depends on CRYPTO_SHA256=y
+       help
+         This is a new version of the kexec system call. This call is
+         file based and takes in file descriptors as system call arguments
+         for kernel and initramfs as opposed to a list of segments as is the
+         case for the older kexec call.
+
 config RELOCATABLE
        bool "Build a relocatable kernel"
        depends on (PPC64 && !COMPILE_TEST) || (FLATMEM && (44x || FSL_BOOKE))
@@ -499,7 +521,7 @@ config CRASH_DUMP
 
 config FA_DUMP
        bool "Firmware-assisted dump"
-       depends on PPC64 && PPC_RTAS && CRASH_DUMP && KEXEC
+       depends on PPC64 && PPC_RTAS && CRASH_DUMP && KEXEC_CORE
        help
          A robust mechanism to get reliable kernel crash dump with
          assistance from firmware. This approach does not use kexec,
@@ -558,6 +580,13 @@ config ARCH_SPARSEMEM_DEFAULT
 config SYS_SUPPORTS_HUGETLBFS
        bool
 
+config ILLEGAL_POINTER_VALUE
+       hex
+       # This is roughly half way between the top of user space and the bottom
+       # of kernel space, which seems about as good as we can get.
+       default 0x5deadbeef0000000 if PPC64
+       default 0
+
 source "mm/Kconfig"
 
 config ARCH_MEMORY_PROBE
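
The KEXEC_FILE option added above enables the file-based kexec_file_load(2) system call, which later hunks in this series wire up as powerpc syscall number 382. A minimal userspace sketch of loading a kernel and initramfs through it is shown below; the file paths and command line are illustrative assumptions, not part of this commit.

/* Hedged sketch: invoking kexec_file_load(2) via syscall(). Paths and the
 * command line are placeholder assumptions. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_kexec_file_load
#define __NR_kexec_file_load 382	/* value added for powerpc in this commit */
#endif

int main(void)
{
	int kernel_fd = open("/boot/vmlinux", O_RDONLY);
	int initrd_fd = open("/boot/initrd.img", O_RDONLY);
	const char *cmdline = "root=/dev/sda2 ro";

	if (kernel_fd < 0 || initrd_fd < 0) {
		perror("open");
		return 1;
	}

	/* cmdline length passed to the kernel includes the trailing NUL */
	if (syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
		    strlen(cmdline) + 1, cmdline, 0UL) < 0) {
		perror("kexec_file_load");
		return 1;
	}
	return 0;
}
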
index 20cf770611ec4a9b136a5ccf4eced60b04f48224..949258d412d07193d72bcdc0c253fc500c3bd086 100644 (file)
@@ -366,4 +366,8 @@ config PPC_PTDUMP
 
          If you are unsure, say N.
 
+config PPC_HTDUMP
+       def_bool y
+       depends on PPC_PTDUMP && PPC_BOOK3S
+
 endmenu
index 041fda1e2a5db178892ed1effaf3424b62a72824..31286fa7873c1df915814b9bc62156b140006cd2 100644 (file)
@@ -23,7 +23,7 @@ CROSS32AR             := $(CROSS32_COMPILE)ar
 ifeq ($(HAS_BIARCH),y)
 ifeq ($(CROSS32_COMPILE),)
 CROSS32CC      := $(CC) -m32
-CROSS32AR      := GNUTARGET=elf32-powerpc $(AR)
+KBUILD_ARFLAGS += --target=elf32-powerpc
 endif
 endif
 
@@ -85,7 +85,7 @@ ifeq ($(HAS_BIARCH),y)
 override AS    += -a$(BITS)
 override LD    += -m elf$(BITS)$(LDEMULATION)
 override CC    += -m$(BITS)
-override AR    := GNUTARGET=elf$(BITS)-$(GNUTARGET) $(AR)
+KBUILD_ARFLAGS += --target=elf$(BITS)-$(GNUTARGET)
 endif
 
 LDFLAGS_vmlinux-y := -Bstatic
@@ -250,6 +250,7 @@ core-y                              += arch/powerpc/kernel/ \
 core-$(CONFIG_XMON)            += arch/powerpc/xmon/
 core-$(CONFIG_KVM)             += arch/powerpc/kvm/
 core-$(CONFIG_PERF_EVENTS)     += arch/powerpc/perf/
+core-$(CONFIG_KEXEC_FILE)      += arch/powerpc/purgatory/
 
 drivers-$(CONFIG_OPROFILE)     += arch/powerpc/oprofile/
 
@@ -276,16 +277,16 @@ zImage: relocs_check
 endif
 
 $(BOOT_TARGETS1): vmlinux
-       $(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
+       $(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
 $(BOOT_TARGETS2): vmlinux
-       $(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
+       $(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
 
 
 bootwrapper_install:
-       $(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
+       $(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
 
 %.dtb: scripts
-       $(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
+       $(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
 
 # Used to create 'merged defconfigs'
 # To use it $(call) it with the first argument as the base defconfig
index eae2dc8bc218165242f4406df9a6273755b60770..1d41d5a9d05ed128775adfbce9a5e278b07ad654 100644 (file)
@@ -171,10 +171,6 @@ $(addprefix $(obj)/,$(libfdt) $(libfdtheader)): $(obj)/%: $(srctree)/scripts/dtc
 $(obj)/empty.c:
        $(Q)touch $@
 
-$(obj)/zImage.lds: $(obj)/%: $(srctree)/$(src)/%.S
-       $(CROSS32CC) $(cpp_flags) -E -Wp,-MD,$(depfile) -P -Upowerpc \
-               -D__ASSEMBLY__ -DLINKER_SCRIPT -o $@ $<
-
 $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: $(srctree)/$(src)/%.S
        $(Q)cp $< $@
 
@@ -356,17 +352,17 @@ $(addprefix $(obj)/, $(initrd-y)): $(obj)/ramdisk.image.gz
 # Don't put the ramdisk on the pattern rule; when its missing make will try
 # the pattern rule with less dependencies that also matches (even with the
 # hard dependency listed).
-$(obj)/zImage.initrd.%: vmlinux $(wrapperbits)
+$(obj)/zImage.initrd.%: vmlinux $(wrapperbits) FORCE
        $(call if_changed,wrap,$*,,,$(obj)/ramdisk.image.gz)
 
-$(addprefix $(obj)/, $(sort $(filter zImage.%, $(image-y)))): vmlinux $(wrapperbits)
+$(addprefix $(obj)/, $(sort $(filter zImage.%, $(image-y)))): vmlinux $(wrapperbits) FORCE
        $(call if_changed,wrap,$(subst $(obj)/zImage.,,$@))
 
 # dtbImage% - a dtbImage is a zImage with an embedded device tree blob
-$(obj)/dtbImage.initrd.%: vmlinux $(wrapperbits) $(obj)/%.dtb
+$(obj)/dtbImage.initrd.%: vmlinux $(wrapperbits) $(obj)/%.dtb FORCE
        $(call if_changed,wrap,$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
 
-$(obj)/dtbImage.%: vmlinux $(wrapperbits) $(obj)/%.dtb
+$(obj)/dtbImage.%: vmlinux $(wrapperbits) $(obj)/%.dtb FORCE
        $(call if_changed,wrap,$*,,$(obj)/$*.dtb)
 
 # This cannot be in the root of $(src) as the zImage rule always adds a $(obj)
@@ -374,31 +370,31 @@ $(obj)/dtbImage.%: vmlinux $(wrapperbits) $(obj)/%.dtb
 $(obj)/vmlinux.strip: vmlinux
        $(STRIP) -s -R .comment $< -o $@
 
-$(obj)/uImage: vmlinux $(wrapperbits)
+$(obj)/uImage: vmlinux $(wrapperbits) FORCE
        $(call if_changed,wrap,uboot)
 
-$(obj)/uImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits)
+$(obj)/uImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
        $(call if_changed,wrap,uboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
 
-$(obj)/uImage.%: vmlinux $(obj)/%.dtb $(wrapperbits)
+$(obj)/uImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
        $(call if_changed,wrap,uboot-$*,,$(obj)/$*.dtb)
 
-$(obj)/cuImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits)
+$(obj)/cuImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
        $(call if_changed,wrap,cuboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
 
-$(obj)/cuImage.%: vmlinux $(obj)/%.dtb $(wrapperbits)
+$(obj)/cuImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
        $(call if_changed,wrap,cuboot-$*,,$(obj)/$*.dtb)
 
-$(obj)/simpleImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits)
+$(obj)/simpleImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
        $(call if_changed,wrap,simpleboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
 
-$(obj)/simpleImage.%: vmlinux $(obj)/%.dtb $(wrapperbits)
+$(obj)/simpleImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
        $(call if_changed,wrap,simpleboot-$*,,$(obj)/$*.dtb)
 
-$(obj)/treeImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits)
+$(obj)/treeImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
        $(call if_changed,wrap,treeboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
 
-$(obj)/treeImage.%: vmlinux $(obj)/%.dtb $(wrapperbits)
+$(obj)/treeImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
        $(call if_changed,wrap,treeboot-$*,,$(obj)/$*.dtb)
 
 # Rule to build device tree blobs
index b6fcbaf5027bedc23c6da663b79453b08f5be4f2..3dc44b05fb9724a9d2761233c6d4bad54c6258cc 100644 (file)
@@ -57,11 +57,6 @@ __system_reset_overlay:
        bctr
 
 1:
-       /* Save the value at addr zero for a null pointer write check later. */
-
-       li      r4, 0
-       lwz     r3, 0(r4)
-
        /* Primary delays then goes to _zimage_start in wrapper. */
 
        or      31, 31, 31 /* db16cyc */
index 4ec2d86d3c50571a2a62f27c31f00739595ed219..a05558a7e51ad3b2698e8cf6f1acb8922ff5bfa0 100644 (file)
@@ -119,13 +119,12 @@ void ps3_copy_vectors(void)
        flush_cache((void *)0x100, 512);
 }
 
-void platform_init(unsigned long null_check)
+void platform_init(void)
 {
        const u32 heapsize = 0x1000000 - (u32)_end; /* 16MiB */
        void *chosen;
        unsigned long ft_addr;
        u64 rm_size;
-       unsigned long val;
 
        console_ops.write = ps3_console_write;
        platform_ops.exit = ps3_exit;
@@ -153,11 +152,6 @@ void platform_init(unsigned long null_check)
 
        printf(" flat tree at 0x%lx\n\r", ft_addr);
 
-       val = *(unsigned long *)0;
-
-       if (val != null_check)
-               printf("null check failed: %lx != %lx\n\r", val, null_check);
-
        ((kernel_entry_t)0)(ft_addr, 0, NULL);
 
        ps3_exit();
index 404b3aabdb4dcda41e2e06f2b90d9c73b2b3fe39..76fe3ccfd381d6e9099f5490c36145d36abaa296 100755 (executable)
@@ -181,6 +181,28 @@ case "$elfformat" in
     elf32-powerpc)     format=elf32ppc ;;
 esac
 
+ld_version()
+{
+    # Poached from scripts/ld-version.sh, but we don't want to call that because
+    # this script (wrapper) is distributed separately from the kernel source.
+    # Extract linker version number from stdin and turn into single number.
+    awk '{
+       gsub(".*\\)", "");
+       gsub(".*version ", "");
+       gsub("-.*", "");
+       split($1,a, ".");
+       print a[1]*100000000 + a[2]*1000000 + a[3]*10000;
+       exit
+    }'
+}
+
+# Do not include PT_INTERP segment when linking pie. Non-pie linking
+# just ignores this option.
+LD_VERSION=$(${CROSS}ld --version | ld_version)
+LD_NO_DL_MIN_VERSION=$(echo 2.26 | ld_version)
+if [ "$LD_VERSION" -ge "$LD_NO_DL_MIN_VERSION" ] ; then
+       nodl="--no-dynamic-linker"
+fi
 
 platformo=$object/"$platform".o
 lds=$object/zImage.lds
@@ -446,7 +468,7 @@ if [ "$platform" != "miboot" ]; then
         text_start="-Ttext $link_address"
     fi
 #link everything
-    ${CROSS}ld -m $format -T $lds $text_start $pie -o "$ofile" \
+    ${CROSS}ld -m $format -T $lds $text_start $pie $nodl -o "$ofile" \
        $platformo $tmp $object/wrapper.a
     rm $tmp
 fi
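
The ld_version helper added above collapses a binutils version string into a single comparable integer: major*100000000 + minor*1000000 + patch*10000. For the 2.26 threshold used to decide whether --no-dynamic-linker is passed, that works out to 2*100000000 + 26*1000000 = 226000000, so any linker reporting 2.26 or newer compares greater than or equal to LD_NO_DL_MIN_VERSION.
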
index d77af0eca967c700b269c84bd17400af890131d0..e4d53fe5976ae9a3eb1536e3562e99ffc9688321 100644 (file)
@@ -49,6 +49,7 @@ CONFIG_BINFMT_MISC=m
 CONFIG_PPC_TRANSACTIONAL_MEM=y
 CONFIG_HOTPLUG_CPU=y
 CONFIG_KEXEC=y
+CONFIG_KEXEC_FILE=y
 CONFIG_IRQ_ALL_CPUS=y
 CONFIG_NUMA=y
 CONFIG_MEMORY_HOTPLUG=y
@@ -296,7 +297,10 @@ CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_CCM=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_SHA256=y
 CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
@@ -304,6 +308,7 @@ CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
+CONFIG_CRYPTO_SHA1_PPC=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
index df5f1616375fae1c8d046c8c9e368f9aa92f6dba..0396126ba6a83011b7076dfac60d4408381f44a8 100644 (file)
@@ -46,6 +46,7 @@ CONFIG_HZ_100=y
 CONFIG_BINFMT_MISC=m
 CONFIG_PPC_TRANSACTIONAL_MEM=y
 CONFIG_KEXEC=y
+CONFIG_KEXEC_FILE=y
 CONFIG_CRASH_DUMP=y
 CONFIG_IRQ_ALL_CPUS=y
 CONFIG_MEMORY_HOTREMOVE=y
@@ -332,7 +333,10 @@ CONFIG_PPC_EARLY_DEBUG=y
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_SHA256=y
 CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
@@ -340,6 +344,7 @@ CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
+CONFIG_CRYPTO_SHA1_PPC=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
index 536e811109362929884b4197f4a21fa0316d26ab..5a06bdde167433984c6c2aac624292e6d35df89a 100644 (file)
@@ -52,6 +52,7 @@ CONFIG_HZ_100=y
 CONFIG_BINFMT_MISC=m
 CONFIG_PPC_TRANSACTIONAL_MEM=y
 CONFIG_KEXEC=y
+CONFIG_KEXEC_FILE=y
 CONFIG_IRQ_ALL_CPUS=y
 CONFIG_MEMORY_HOTPLUG=y
 CONFIG_MEMORY_HOTREMOVE=y
@@ -295,7 +296,10 @@ CONFIG_XMON=y
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_SHA256=y
 CONFIG_CRYPTO_TGR192=m
 CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_ANUBIS=m
@@ -303,6 +307,7 @@ CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAST6=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
+CONFIG_CRYPTO_SHA1_PPC=m
 CONFIG_CRYPTO_SERPENT=m
 CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
index d1492736d85223d54913eef6c2014d915cdb7f78..dfef1174663eba08ca70b7d654efc5874792c37e 100644 (file)
@@ -13,8 +13,6 @@
  */
 
 #include <linux/threads.h>
-#include <linux/kprobes.h>
-
 #include <uapi/asm/ucontext.h>
 
 /* SMP */
index 388b0522f748a46f47cf293c74c55f7be2b0c4c3..012223638815569bb424e58021f3f9a7196ff0f0 100644 (file)
@@ -233,7 +233,8 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
 
 
 static inline void __ptep_set_access_flags(struct mm_struct *mm,
-                                          pte_t *ptep, pte_t entry)
+                                          pte_t *ptep, pte_t entry,
+                                          unsigned long address)
 {
        unsigned long set = pte_val(entry) &
                (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
index 1af837c561bae9d74152a048fc068fac299f8a37..1c64bc6330bc0b9f70b71038e678fd110e6e1456 100644 (file)
@@ -16,9 +16,6 @@
 #define H_PUD_TABLE_SIZE       (sizeof(pud_t) << H_PUD_INDEX_SIZE)
 #define H_PGD_TABLE_SIZE       (sizeof(pgd_t) << H_PGD_INDEX_SIZE)
 
-/* With 4k base page size, hugepage PTEs go at the PMD level */
-#define MIN_HUGEPTE_SHIFT      PMD_SHIFT
-
 /* PTE flags to conserve for HPTE identification */
 #define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | \
                         H_PAGE_F_SECOND | H_PAGE_F_GIX)
index 5aae4f530c21d7ceb2ae8c190d26030d4da0172d..f3dd21efa2ead0e059fcfcca0056a20c22d56131 100644 (file)
@@ -6,9 +6,6 @@
 #define H_PUD_INDEX_SIZE  5
 #define H_PGD_INDEX_SIZE  12
 
-/* With 4k base page size, hugepage PTEs go at the PMD level */
-#define MIN_HUGEPTE_SHIFT      PAGE_SHIFT
-
 #define H_PAGE_COMBO   0x00001000 /* this is a combo 4k page */
 #define H_PAGE_4K_PFN  0x00002000 /* PFN is for a single 4k page */
 /*
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h b/arch/powerpc/include/asm/book3s/64/hugetlb-radix.h
deleted file mode 100644 (file)
index c45189a..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H
-#define _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H
-/*
- * For radix we want generic code to handle hugetlb. But then if we want
- * both hash and radix to be enabled together we need to workaround the
- * limitations.
- */
-void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
-void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
-extern unsigned long
-radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
-                               unsigned long len, unsigned long pgoff,
-                               unsigned long flags);
-
-static inline int hstate_get_psize(struct hstate *hstate)
-{
-       unsigned long shift;
-
-       shift = huge_page_shift(hstate);
-       if (shift == mmu_psize_defs[MMU_PAGE_2M].shift)
-               return MMU_PAGE_2M;
-       else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift)
-               return MMU_PAGE_1G;
-       else {
-               WARN(1, "Wrong huge page shift\n");
-               return mmu_virtual_psize;
-       }
-}
-#endif
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h
new file mode 100644 (file)
index 0000000..c62f14d
--- /dev/null
@@ -0,0 +1,53 @@
+#ifndef _ASM_POWERPC_BOOK3S_64_HUGETLB_H
+#define _ASM_POWERPC_BOOK3S_64_HUGETLB_H
+/*
+ * For radix we want generic code to handle hugetlb. But then if we want
+ * both hash and radix to be enabled together we need to workaround the
+ * limitations.
+ */
+void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
+void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
+extern unsigned long
+radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+                               unsigned long len, unsigned long pgoff,
+                               unsigned long flags);
+
+static inline int hstate_get_psize(struct hstate *hstate)
+{
+       unsigned long shift;
+
+       shift = huge_page_shift(hstate);
+       if (shift == mmu_psize_defs[MMU_PAGE_2M].shift)
+               return MMU_PAGE_2M;
+       else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift)
+               return MMU_PAGE_1G;
+       else if (shift == mmu_psize_defs[MMU_PAGE_16M].shift)
+               return MMU_PAGE_16M;
+       else if (shift == mmu_psize_defs[MMU_PAGE_16G].shift)
+               return MMU_PAGE_16G;
+       else {
+               WARN(1, "Wrong huge page shift\n");
+               return mmu_virtual_psize;
+       }
+}
+
+#define arch_make_huge_pte arch_make_huge_pte
+static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+                                      struct page *page, int writable)
+{
+       unsigned long page_shift;
+
+       if (!cpu_has_feature(CPU_FTR_POWER9_DD1))
+               return entry;
+
+       page_shift = huge_page_shift(hstate_vma(vma));
+       /*
+        * We don't support 1G hugetlb pages yet.
+        */
+       VM_WARN_ON(page_shift == mmu_psize_defs[MMU_PAGE_1G].shift);
+       if (page_shift == mmu_psize_defs[MMU_PAGE_2M].shift)
+               return __pte(pte_val(entry) | _PAGE_LARGE);
+       else
+               return entry;
+}
+#endif
index e407af2b7333500a4ebcc319c5337827abe34509..2e6a823fa502724f6c3a7fb17312b21514ef2868 100644 (file)
@@ -70,7 +70,9 @@
 
 #define HPTE_V_SSIZE_SHIFT     62
 #define HPTE_V_AVPN_SHIFT      7
+#define HPTE_V_COMMON_BITS     ASM_CONST(0x000fffffffffffff)
 #define HPTE_V_AVPN            ASM_CONST(0x3fffffffffffff80)
+#define HPTE_V_AVPN_3_0                ASM_CONST(0x000fffffffffff80)
 #define HPTE_V_AVPN_VAL(x)     (((x) & HPTE_V_AVPN) >> HPTE_V_AVPN_SHIFT)
 #define HPTE_V_COMPARE(x,y)    (!(((x) ^ (y)) & 0xffffffffffffff80UL))
 #define HPTE_V_BOLTED          ASM_CONST(0x0000000000000010)
 #define HPTE_V_VALID           ASM_CONST(0x0000000000000001)
 
 /*
- * ISA 3.0 have a different HPTE format.
+ * ISA 3.0 has a different HPTE format.
  */
 #define HPTE_R_3_0_SSIZE_SHIFT 58
+#define HPTE_R_3_0_SSIZE_MASK  (3ull << HPTE_R_3_0_SSIZE_SHIFT)
 #define HPTE_R_PP0             ASM_CONST(0x8000000000000000)
 #define HPTE_R_TS              ASM_CONST(0x4000000000000000)
 #define HPTE_R_KEY_HI          ASM_CONST(0x3000000000000000)
 #define HPTE_R_RPN_SHIFT       12
 #define HPTE_R_RPN             ASM_CONST(0x0ffffffffffff000)
+#define HPTE_R_RPN_3_0         ASM_CONST(0x01fffffffffff000)
 #define HPTE_R_PP              ASM_CONST(0x0000000000000003)
 #define HPTE_R_PPP             ASM_CONST(0x8000000000000003)
 #define HPTE_R_N               ASM_CONST(0x0000000000000004)
@@ -316,11 +320,42 @@ static inline unsigned long hpte_encode_avpn(unsigned long vpn, int psize,
         */
        v = (vpn >> (23 - VPN_SHIFT)) & ~(mmu_psize_defs[psize].avpnm);
        v <<= HPTE_V_AVPN_SHIFT;
-       if (!cpu_has_feature(CPU_FTR_ARCH_300))
-               v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
+       v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
        return v;
 }
 
+/*
+ * ISA v3.0 defines a new HPTE format, which differs from the old
+ * format in having smaller AVPN and ARPN fields, and the B field
+ * in the second dword instead of the first.
+ */
+static inline unsigned long hpte_old_to_new_v(unsigned long v)
+{
+       /* trim AVPN, drop B */
+       return v & HPTE_V_COMMON_BITS;
+}
+
+static inline unsigned long hpte_old_to_new_r(unsigned long v, unsigned long r)
+{
+       /* move B field from 1st to 2nd dword, trim ARPN */
+       return (r & ~HPTE_R_3_0_SSIZE_MASK) |
+               (((v) >> HPTE_V_SSIZE_SHIFT) << HPTE_R_3_0_SSIZE_SHIFT);
+}
+
+static inline unsigned long hpte_new_to_old_v(unsigned long v, unsigned long r)
+{
+       /* insert B field */
+       return (v & HPTE_V_COMMON_BITS) |
+               ((r & HPTE_R_3_0_SSIZE_MASK) <<
+                (HPTE_V_SSIZE_SHIFT - HPTE_R_3_0_SSIZE_SHIFT));
+}
+
+static inline unsigned long hpte_new_to_old_r(unsigned long r)
+{
+       /* clear out B field */
+       return r & ~HPTE_R_3_0_SSIZE_MASK;
+}
+
 /*
  * This function sets the AVPN and L fields of the HPTE  appropriately
  * using the base page size and actual page size.
@@ -341,12 +376,8 @@ static inline unsigned long hpte_encode_v(unsigned long vpn, int base_psize,
  * aligned for the requested page size
  */
 static inline unsigned long hpte_encode_r(unsigned long pa, int base_psize,
-                                         int actual_psize, int ssize)
+                                         int actual_psize)
 {
-
-       if (cpu_has_feature(CPU_FTR_ARCH_300))
-               pa |= ((unsigned long) ssize) << HPTE_R_3_0_SSIZE_SHIFT;
-
        /* A 4K page needs no special encoding */
        if (actual_psize == MMU_PAGE_4K)
                return pa & HPTE_R_RPN;
index 0a46a5f2a73919f18c3326bc26b9e4a26061b921..6cfc5dbdae032bc8f2b5b7ca6541074dd02afe6a 100644 (file)
 #define _RPAGE_SW1             0x00800
 #define _RPAGE_SW2             0x00400
 #define _RPAGE_SW3             0x00200
+#define _RPAGE_RSV1            0x1000000000000000UL
+#define _RPAGE_RSV2            0x0800000000000000UL
+#define _RPAGE_RSV3            0x0400000000000000UL
+#define _RPAGE_RSV4            0x0200000000000000UL
+
 #ifdef CONFIG_MEM_SOFT_DIRTY
 #define _PAGE_SOFT_DIRTY       _RPAGE_SW3 /* software: software dirty tracking */
 #else
 #endif
 #define _PAGE_SPECIAL          _RPAGE_SW2 /* software: special page */
 
+/*
+ * For P9 DD1 only, we need to track whether the pte's huge.
+ */
+#define _PAGE_LARGE    _RPAGE_RSV1
+
 
 #define _PAGE_PTE              (1ul << 62)     /* distinguishes PTEs from pointers */
 #define _PAGE_PRESENT          (1ul << 63)     /* pte contains a translation */
@@ -568,10 +578,11 @@ static inline bool check_pte_access(unsigned long access, unsigned long ptev)
  */
 
 static inline void __ptep_set_access_flags(struct mm_struct *mm,
-                                          pte_t *ptep, pte_t entry)
+                                          pte_t *ptep, pte_t entry,
+                                          unsigned long address)
 {
        if (radix_enabled())
-               return radix__ptep_set_access_flags(mm, ptep, entry);
+               return radix__ptep_set_access_flags(mm, ptep, entry, address);
        return hash__ptep_set_access_flags(ptep, entry);
 }
 
index 2a46dea8e1b18c46299c9cc9f3eef4ed7d4ef35b..b4d1302387a3fee203c6b80a9fcb7b73c9b9136b 100644 (file)
@@ -140,19 +140,20 @@ static inline unsigned long radix__pte_update(struct mm_struct *mm,
                unsigned long new_pte;
 
                old_pte = __radix_pte_update(ptep, ~0, 0);
-               asm volatile("ptesync" : : : "memory");
                /*
                 * new value of pte
                 */
                new_pte = (old_pte | set) & ~clr;
-
                /*
-                * For now let's do heavy pid flush
-                * radix__flush_tlb_page_psize(mm, addr, mmu_virtual_psize);
+                * If we are trying to clear the pte, we can skip
+                * the below sequence and batch the tlb flush. The
+                * tlb flush batching is done by mmu gather code
                 */
-               radix__flush_tlb_mm(mm);
-
-               __radix_pte_update(ptep, 0, new_pte);
+               if (new_pte) {
+                       asm volatile("ptesync" : : : "memory");
+                       radix__flush_tlb_pte_p9_dd1(old_pte, mm, addr);
+                       __radix_pte_update(ptep, 0, new_pte);
+               }
        } else
                old_pte = __radix_pte_update(ptep, clr, set);
        asm volatile("ptesync" : : : "memory");
@@ -167,7 +168,8 @@ static inline unsigned long radix__pte_update(struct mm_struct *mm,
  * function doesn't need to invalidate tlb.
  */
 static inline void radix__ptep_set_access_flags(struct mm_struct *mm,
-                                               pte_t *ptep, pte_t entry)
+                                               pte_t *ptep, pte_t entry,
+                                               unsigned long address)
 {
 
        unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
@@ -183,13 +185,7 @@ static inline void radix__ptep_set_access_flags(struct mm_struct *mm,
                 * new value of pte
                 */
                new_pte = old_pte | set;
-
-               /*
-                * For now let's do heavy pid flush
-                * radix__flush_tlb_page_psize(mm, addr, mmu_virtual_psize);
-                */
-               radix__flush_tlb_mm(mm);
-
+               radix__flush_tlb_pte_p9_dd1(old_pte, mm, address);
                __radix_pte_update(ptep, 0, new_pte);
        } else
                __radix_pte_update(ptep, 0, set);
@@ -243,6 +239,8 @@ static inline int radix__pmd_trans_huge(pmd_t pmd)
 
 static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
 {
+       if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+               return __pmd(pmd_val(pmd) | _PAGE_PTE | _PAGE_LARGE);
        return __pmd(pmd_val(pmd) | _PAGE_PTE);
 }
 static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma,
index a9e19cb2f7c559621cc870d49d25c5e36b062463..cc7fbde4f53cac09b257f9a7ff4f1202af7c3c98 100644 (file)
@@ -42,4 +42,6 @@ extern void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa,
                                     unsigned long page_size);
 extern void radix__flush_tlb_lpid(unsigned long lpid);
 extern void radix__flush_tlb_all(void);
+extern void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm,
+                                       unsigned long address);
 #endif
index 44efe739b6b9b4979f95b3a26d8abf52dcda3c26..fc46b664c49e8adfb58bd4c45fe9f439ee7eb96a 100644 (file)
@@ -7,6 +7,71 @@
 #include <asm/asm-compat.h>
 #include <linux/bug.h>
 
+#ifdef __BIG_ENDIAN
+#define BITOFF_CAL(size, off)  ((sizeof(u32) - size - off) * BITS_PER_BYTE)
+#else
+#define BITOFF_CAL(size, off)  (off * BITS_PER_BYTE)
+#endif
+
+#define XCHG_GEN(type, sfx, cl)                                \
+static inline u32 __xchg_##type##sfx(volatile void *p, u32 val)        \
+{                                                              \
+       unsigned int prev, prev_mask, tmp, bitoff, off;         \
+                                                               \
+       off = (unsigned long)p % sizeof(u32);                   \
+       bitoff = BITOFF_CAL(sizeof(type), off);                 \
+       p -= off;                                               \
+       val <<= bitoff;                                         \
+       prev_mask = (u32)(type)-1 << bitoff;                    \
+                                                               \
+       __asm__ __volatile__(                                   \
+"1:    lwarx   %0,0,%3\n"                                      \
+"      andc    %1,%0,%5\n"                                     \
+"      or      %1,%1,%4\n"                                     \
+       PPC405_ERR77(0,%3)                                      \
+"      stwcx.  %1,0,%3\n"                                      \
+"      bne-    1b\n"                                           \
+       : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)            \
+       : "r" (p), "r" (val), "r" (prev_mask)                   \
+       : "cc", cl);                                            \
+                                                               \
+       return prev >> bitoff;                                  \
+}
+
+#define CMPXCHG_GEN(type, sfx, br, br2, cl)                    \
+static inline                                                  \
+u32 __cmpxchg_##type##sfx(volatile void *p, u32 old, u32 new)  \
+{                                                              \
+       unsigned int prev, prev_mask, tmp, bitoff, off;         \
+                                                               \
+       off = (unsigned long)p % sizeof(u32);                   \
+       bitoff = BITOFF_CAL(sizeof(type), off);                 \
+       p -= off;                                               \
+       old <<= bitoff;                                         \
+       new <<= bitoff;                                         \
+       prev_mask = (u32)(type)-1 << bitoff;                    \
+                                                               \
+       __asm__ __volatile__(                                   \
+       br                                                      \
+"1:    lwarx   %0,0,%3\n"                                      \
+"      and     %1,%0,%6\n"                                     \
+"      cmpw    0,%1,%4\n"                                      \
+"      bne-    2f\n"                                           \
+"      andc    %1,%0,%6\n"                                     \
+"      or      %1,%1,%5\n"                                     \
+       PPC405_ERR77(0,%3)                                      \
+"      stwcx.  %1,0,%3\n"                                      \
+"      bne-    1b\n"                                           \
+       br2                                                     \
+       "\n"                                                    \
+"2:"                                                           \
+       : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p)            \
+       : "r" (p), "r" (old), "r" (new), "r" (prev_mask)        \
+       : "cc", cl);                                            \
+                                                               \
+       return prev >> bitoff;                                  \
+}
+
 /*
  * Atomic exchange
  *
  * the previous value stored there.
  */
 
+XCHG_GEN(u8, _local, "memory");
+XCHG_GEN(u8, _relaxed, "cc");
+XCHG_GEN(u16, _local, "memory");
+XCHG_GEN(u16, _relaxed, "cc");
+
 static __always_inline unsigned long
 __xchg_u32_local(volatile void *p, unsigned long val)
 {
@@ -85,9 +155,13 @@ __xchg_u64_relaxed(u64 *p, unsigned long val)
 #endif
 
 static __always_inline unsigned long
-__xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
+__xchg_local(void *ptr, unsigned long x, unsigned int size)
 {
        switch (size) {
+       case 1:
+               return __xchg_u8_local(ptr, x);
+       case 2:
+               return __xchg_u16_local(ptr, x);
        case 4:
                return __xchg_u32_local(ptr, x);
 #ifdef CONFIG_PPC64
@@ -103,6 +177,10 @@ static __always_inline unsigned long
 __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
 {
        switch (size) {
+       case 1:
+               return __xchg_u8_relaxed(ptr, x);
+       case 2:
+               return __xchg_u16_relaxed(ptr, x);
        case 4:
                return __xchg_u32_relaxed(ptr, x);
 #ifdef CONFIG_PPC64
@@ -131,6 +209,15 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
  * and return the old value of *p.
  */
 
+CMPXCHG_GEN(u8, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory");
+CMPXCHG_GEN(u8, _local, , , "memory");
+CMPXCHG_GEN(u8, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
+CMPXCHG_GEN(u8, _relaxed, , , "cc");
+CMPXCHG_GEN(u16, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory");
+CMPXCHG_GEN(u16, _local, , , "memory");
+CMPXCHG_GEN(u16, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
+CMPXCHG_GEN(u16, _relaxed, , , "cc");
+
 static __always_inline unsigned long
 __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
 {
@@ -316,6 +403,10 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
          unsigned int size)
 {
        switch (size) {
+       case 1:
+               return __cmpxchg_u8(ptr, old, new);
+       case 2:
+               return __cmpxchg_u16(ptr, old, new);
        case 4:
                return __cmpxchg_u32(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -328,10 +419,14 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
 }
 
 static __always_inline unsigned long
-__cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
+__cmpxchg_local(void *ptr, unsigned long old, unsigned long new,
          unsigned int size)
 {
        switch (size) {
+       case 1:
+               return __cmpxchg_u8_local(ptr, old, new);
+       case 2:
+               return __cmpxchg_u16_local(ptr, old, new);
        case 4:
                return __cmpxchg_u32_local(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -348,6 +443,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
                  unsigned int size)
 {
        switch (size) {
+       case 1:
+               return __cmpxchg_u8_relaxed(ptr, old, new);
+       case 2:
+               return __cmpxchg_u16_relaxed(ptr, old, new);
        case 4:
                return __cmpxchg_u32_relaxed(ptr, old, new);
 #ifdef CONFIG_PPC64
@@ -364,6 +463,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
                  unsigned int size)
 {
        switch (size) {
+       case 1:
+               return __cmpxchg_u8_acquire(ptr, old, new);
+       case 2:
+               return __cmpxchg_u16_acquire(ptr, old, new);
        case 4:
                return __cmpxchg_u32_acquire(ptr, old, new);
 #ifdef CONFIG_PPC64
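
The XCHG_GEN/CMPXCHG_GEN macros above emulate 1- and 2-byte atomics using the word-sized lwarx/stwcx. reservation: the target byte or halfword is located within its aligned 32-bit word, the operands are shifted into position, and a mask selects just the bytes being exchanged. The following plain-C sketch shows only that address/mask arithmetic for the big-endian u8 case; it is not atomic, and the real code performs the update inside the lwarx/stwcx. retry loop.

#include <stdint.h>

/* Non-atomic illustration of the BITOFF_CAL()/prev_mask arithmetic used by
 * XCHG_GEN(u8, ...) on a big-endian system. */
static uint8_t xchg_u8_sketch(volatile void *p, uint8_t val)
{
	unsigned long off = (unsigned long)p % sizeof(uint32_t);
	unsigned int bitoff = (sizeof(uint32_t) - sizeof(uint8_t) - off) * 8;
	volatile uint32_t *word = (volatile uint32_t *)((char *)p - off);
	uint32_t prev_mask = (uint32_t)0xff << bitoff;
	uint32_t prev = *word;

	/* clear the old byte, insert the new one */
	*word = (prev & ~prev_mask) | ((uint32_t)val << bitoff);

	return prev >> bitoff;	/* previous byte value */
}
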
index a954e4975049c5632059e13535932ae0c8e7587c..86308f177f2d8a62cc718f0de528c727ad5ab656 100644 (file)
@@ -10,7 +10,7 @@ struct pt_regs;
 
 extern struct dentry *powerpc_debugfs_root;
 
-#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
+#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE)
 
 extern int (*__debugger)(struct pt_regs *regs);
 extern int (*__debugger_ipi)(struct pt_regs *regs);
index 3facdd41709c9b2be0b3342905aef3aae33fda7a..ede215167d1ad4f37f4d732c0eeaeee5cbd58bd4 100644 (file)
@@ -9,7 +9,7 @@ extern struct kmem_cache *hugepte_cache;
 
 #ifdef CONFIG_PPC_BOOK3S_64
 
-#include <asm/book3s/64/hugetlb-radix.h>
+#include <asm/book3s/64/hugetlb.h>
 /*
  * This should work for other subarchs too. But right now we use the
  * new format only for 64bit book3s
index 8d5f8352afd70d8674ec05562e636f96ab46bc40..77ff1ba99d1f660991baad2a5f33c55a939aed4e 100644 (file)
 #define H_GET_MPP_X            0x314
 #define H_SET_MODE             0x31C
 #define H_CLEAR_HPT            0x358
-#define MAX_HCALL_OPCODE       H_CLEAR_HPT
+#define H_SIGNAL_SYS_RESET     0x380
+#define MAX_HCALL_OPCODE       H_SIGNAL_SYS_RESET
 
 /* H_VIOCTL functions */
 #define H_GET_VIOA_DUMP_SIZE   0x01
 #define H_SET_MODE_RESOURCE_ADDR_TRANS_MODE    3
 #define H_SET_MODE_RESOURCE_LE                 4
 
+/* Values for argument to H_SIGNAL_SYS_RESET */
+#define H_SIGNAL_SYS_RESET_ALL                 -1
+#define H_SIGNAL_SYS_RESET_ALL_OTHERS          -2
+/* >= 0 values are CPU number */
+
 #ifndef __ASSEMBLY__
 
 /**
index a46f5f45570c8904a5a13de12ecb3edfed5c2449..6c3b71502fbcbcc6566790b6f910955668bc43aa 100644 (file)
@@ -53,7 +53,7 @@
 
 typedef void (*crash_shutdown_t)(void);
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 
 /*
  * This function is responsible for capturing register states if coming
@@ -91,7 +91,17 @@ static inline bool kdump_in_progress(void)
        return crashing_cpu >= 0;
 }
 
-#else /* !CONFIG_KEXEC */
+#ifdef CONFIG_KEXEC_FILE
+extern struct kexec_file_ops kexec_elf64_ops;
+
+int setup_purgatory(struct kimage *image, const void *slave_code,
+                   const void *fdt, unsigned long kernel_load_addr,
+                   unsigned long fdt_load_addr);
+int setup_new_fdt(void *fdt, unsigned long initrd_load_addr,
+                 unsigned long initrd_len, const char *cmdline);
+#endif /* CONFIG_KEXEC_FILE */
+
+#else /* !CONFIG_KEXEC_CORE */
 static inline void crash_kexec_secondary(struct pt_regs *regs) { }
 
 static inline int overlaps_crashkernel(unsigned long start, unsigned long size)
@@ -116,7 +126,7 @@ static inline bool kdump_in_progress(void)
        return false;
 }
 
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 #endif /* ! __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_KEXEC_H */
index 2c9759bdb63bc2853760dcb98f9a1413c8c810c2..97b8c1f83453038c3c594bae8820fa9e7b128ec5 100644 (file)
@@ -32,6 +32,7 @@
 #include <asm/probes.h>
 #include <asm/code-patching.h>
 
+#ifdef CONFIG_KPROBES
 #define  __ARCH_WANT_KPROBES_INSN_SLOT
 
 struct pt_regs;
@@ -127,5 +128,11 @@ struct kprobe_ctlblk {
 extern int kprobe_exceptions_notify(struct notifier_block *self,
                                        unsigned long val, void *data);
 extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
+extern int kprobe_handler(struct pt_regs *regs);
+extern int kprobe_post_handler(struct pt_regs *regs);
+#else
+static inline int kprobe_handler(struct pt_regs *regs) { return 0; }
+static inline int kprobe_post_handler(struct pt_regs *regs) { return 0; }
+#endif /* CONFIG_KPROBES */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_KPROBES_H */
index e02cbc6a6c704fa7a31b4d46ab22bb79f9546c9a..5011b69107a720b5a52cb4e80ba13019f2757d6b 100644 (file)
@@ -183,7 +183,7 @@ struct machdep_calls {
         */
        void (*machine_shutdown)(void);
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        void (*kexec_cpu_down)(int crash_shutdown, int secondary);
 
        /* Called to do what every setup is needed on image and the
@@ -198,7 +198,7 @@ struct machdep_calls {
         * no return.
         */
        void (*machine_kexec)(struct kimage *image);
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 #ifdef CONFIG_SUSPEND
        /* These are called to disable and enable, respectively, IRQs when
index b119bdd6ed278985dd982cc4c66238c1a7cf561f..09304d2bec03377a96b9128c32084c7475bb56ba 100644 (file)
@@ -208,6 +208,11 @@ extern u64 ppc64_rma_size;
 /* Cleanup function used by kexec */
 extern void mmu_cleanup_all(void);
 extern void radix__mmu_cleanup_all(void);
+
+/* Functions for creating and updating partition table on POWER9 */
+extern void mmu_partition_table_init(void);
+extern void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
+                                         unsigned long dw1);
 #endif /* CONFIG_PPC64 */
 
 struct mm_struct;
index 5c451140660a91b3abc7130040685cea8e3db62d..b9e3f0aca261da2233a086cb41150bc3afacb9dc 100644 (file)
@@ -19,16 +19,18 @@ extern void destroy_context(struct mm_struct *mm);
 struct mm_iommu_table_group_mem_t;
 
 extern int isolate_lru_page(struct page *page);        /* from internal.h */
-extern bool mm_iommu_preregistered(void);
-extern long mm_iommu_get(unsigned long ua, unsigned long entries,
+extern bool mm_iommu_preregistered(struct mm_struct *mm);
+extern long mm_iommu_get(struct mm_struct *mm,
+               unsigned long ua, unsigned long entries,
                struct mm_iommu_table_group_mem_t **pmem);
-extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
-extern void mm_iommu_init(mm_context_t *ctx);
-extern void mm_iommu_cleanup(mm_context_t *ctx);
-extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
-               unsigned long size);
-extern struct mm_iommu_table_group_mem_t *mm_iommu_find(unsigned long ua,
-               unsigned long entries);
+extern long mm_iommu_put(struct mm_struct *mm,
+               struct mm_iommu_table_group_mem_t *mem);
+extern void mm_iommu_init(struct mm_struct *mm);
+extern void mm_iommu_cleanup(struct mm_struct *mm);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
+               unsigned long ua, unsigned long size);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
+               unsigned long ua, unsigned long entries);
 extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
                unsigned long ua, unsigned long *hpa);
 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
index 7bd916e91295510259fe000c2b9d1291c7c41132..ba9921bf202e0c7f2d8579dfc6f31f25ad7cebd7 100644 (file)
@@ -275,7 +275,8 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
 
 
 static inline void __ptep_set_access_flags(struct mm_struct *mm,
-                                          pte_t *ptep, pte_t entry)
+                                          pte_t *ptep, pte_t entry,
+                                          unsigned long address)
 {
        unsigned long set = pte_val(entry) &
                (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
index fc7d51753f8111f77bc030ef99a1c3c79ef47ebd..d0db98793dd83d0ddf5e8d60be2688e697e74491 100644 (file)
@@ -27,9 +27,6 @@
 #define PMD_SIZE       (1UL << PMD_SHIFT)
 #define PMD_MASK       (~(PMD_SIZE-1))
 
-/* With 4k base page size, hugepage PTEs go at the PMD level */
-#define MIN_HUGEPTE_SHIFT      PMD_SHIFT
-
 /* PUD_SHIFT determines what a third-level page table entry can map */
 #define PUD_SHIFT      (PMD_SHIFT + PMD_INDEX_SIZE)
 #define PUD_SIZE       (1UL << PUD_SHIFT)
index 908324574f7704f12f5c6862a3332a1bc0139a1a..55b28ef3409af5494a521b8a948966947555a84d 100644 (file)
@@ -31,9 +31,6 @@
 #define PTRS_PER_PMD   (1 << PMD_INDEX_SIZE)
 #define PTRS_PER_PGD   (1 << PGD_INDEX_SIZE)
 
-/* With 4k base page size, hugepage PTEs go at the PMD level */
-#define MIN_HUGEPTE_SHIFT      PAGE_SHIFT
-
 /* PMD_SHIFT determines what a second-level page table entry can map */
 #define PMD_SHIFT      (PAGE_SHIFT + PTE_INDEX_SIZE)
 #define PMD_SIZE       (1UL << PMD_SHIFT)
index 6c4a14292a9e1d6d14be3aab4758a6081eb0f6bf..c7f927e67d14c54825019529df9cda03d619bf75 100644 (file)
@@ -289,7 +289,8 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
  * function doesn't need to flush the hash entry
  */
 static inline void __ptep_set_access_flags(struct mm_struct *mm,
-                                          pte_t *ptep, pte_t entry)
+                                          pte_t *ptep, pte_t entry,
+                                          unsigned long address)
 {
        unsigned long bits = pte_val(entry) &
                (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
index e958b7096f19c5ea703816e935bd4ba434a370ad..5c7db0f1a7087290f5b0610d9a492987222f0a1e 100644 (file)
@@ -220,9 +220,12 @@ int64_t opal_pci_set_power_state(uint64_t async_token, uint64_t id,
 int64_t opal_pci_poll2(uint64_t id, uint64_t data);
 
 int64_t opal_int_get_xirr(uint32_t *out_xirr, bool just_poll);
+int64_t opal_rm_int_get_xirr(__be32 *out_xirr, bool just_poll);
 int64_t opal_int_set_cppr(uint8_t cppr);
 int64_t opal_int_eoi(uint32_t xirr);
+int64_t opal_rm_int_eoi(uint32_t xirr);
 int64_t opal_int_set_mfrr(uint32_t cpu, uint8_t mfrr);
+int64_t opal_rm_int_set_mfrr(uint32_t cpu, uint8_t mfrr);
 int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t kill_type,
                          uint32_t pe_num, uint32_t tce_size,
                          uint64_t dma_addr, uint32_t npages);
index 034a588b122c53ebf5bcb67b93f5cb1440f660c7..0bcc75e295e317fbcb64e495e62e9b010c8f804c 100644 (file)
@@ -308,4 +308,9 @@ static inline long plapr_set_watchpoint0(unsigned long dawr0, unsigned long dawr
        return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR, dawr0, dawrx0);
 }
 
+static inline long plapr_signal_sys_reset(long cpu)
+{
+       return plpar_hcall_norets(H_SIGNAL_SYS_RESET, cpu);
+}
+
 #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */
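
The new H_SIGNAL_SYS_RESET hcall takes either a specific hardware CPU number or one of the broadcast values defined earlier in this series (-1 for all CPUs, -2 for all others). A hedged sketch of how kernel code might call the wrapper; the error handling is an assumption, not taken from this commit.

#include <linux/printk.h>
#include <asm/hvcall.h>
#include <asm/plpar_wrappers.h>

static void sreset_other_cpus_sketch(void)
{
	long rc = plapr_signal_sys_reset(H_SIGNAL_SYS_RESET_ALL_OTHERS);

	if (rc != H_SUCCESS)
		pr_err("H_SIGNAL_SYS_RESET failed: %ld\n", rc);
}
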
index 0f73de069f19894993dbeb213cdff391afc0c844..726288048652fda01f11099b86f78d28a2096333 100644 (file)
@@ -53,7 +53,7 @@ void eeh_addr_cache_rmv_dev(struct pci_dev *dev);
 struct eeh_dev *eeh_addr_cache_get_dev(unsigned long addr);
 void eeh_slot_error_detail(struct eeh_pe *pe, int severity);
 int eeh_pci_enable(struct eeh_pe *pe, int function);
-int eeh_reset_pe(struct eeh_pe *);
+int eeh_pe_reset_full(struct eeh_pe *pe);
 void eeh_save_bars(struct eeh_dev *edev);
 int rtas_write_config(struct pci_dn *, int where, int size, u32 val);
 int rtas_read_config(struct pci_dn *, int where, int size, u32 *val);
index 7f436ba1b56f5d2246fed43825c41a2581fb7662..5e57705b47599973d19629a014035a7c1fb5a11d 100644 (file)
@@ -159,11 +159,5 @@ struct of_drconf_cell {
 /* Option Vector 6: IBM PAPR hints */
 #define OV6_LINUX              0x02    /* Linux is our OS */
 
-/*
- * The architecture vector has an array of PVR mask/value pairs,
- * followed by # option vectors - 1, followed by the option vectors.
- */
-extern unsigned char ibm_architecture_vec[];
-
 #endif /* __KERNEL__ */
 #endif /* _POWERPC_PROM_H */
index c491cfebfc050faf157da7dcbaf6d76ad3236909..0d4531aa2052d77fb8036b05f3550c33cfdde49e 100644 (file)
 #define PSSCR_EC               0x00100000 /* Exit Criterion */
 #define PSSCR_ESL              0x00200000 /* Enable State Loss */
 #define PSSCR_SD               0x00400000 /* Status Disable */
+#define PSSCR_PLS      0xf000000000000000 /* Power-saving Level Status */
+#define PSSCR_GUEST_VIS        0xf0000000000003ff /* Guest-visible PSSCR fields */
 
 /* Floating Point Status and Control Register (FPSCR) Fields */
 #define FPSCR_FX       0x80000000      /* FPU exception summary */
 #define SPRN_TEXASRU   0x83    /* ''      ''      ''    Upper 32  */
 #define   TEXASR_FS    __MASK(63-36) /* TEXASR Failure Summary */
 #define SPRN_TFHAR     0x80    /* Transaction Failure Handler Addr */
+#define SPRN_TIDR      144     /* Thread ID register */
 #define SPRN_CTRLF     0x088
 #define SPRN_CTRLT     0x098
 #define   CTRL_CT      0xc0000000      /* current thread */
 #define SPRN_HRMOR     0x139   /* Real mode offset register */
 #define SPRN_HSRR0     0x13A   /* Hypervisor Save/Restore 0 */
 #define SPRN_HSRR1     0x13B   /* Hypervisor Save/Restore 1 */
+#define SPRN_ASDR      0x330   /* Access segment descriptor register */
 #define SPRN_IC                0x350   /* Virtual Instruction Count */
 #define SPRN_VTB       0x351   /* Virtual Time Base */
 #define SPRN_LDBAR     0x352   /* LD Base Address Register */
 #define SPRN_PMCR      0x374   /* Power Management Control Register */
 
 /* HFSCR and FSCR bit numbers are the same */
+#define FSCR_MSGP_LG   10      /* Enable MSGP */
 #define FSCR_TAR_LG    8       /* Enable Target Address Register */
 #define FSCR_EBB_LG    7       /* Enable Event Based Branching */
 #define FSCR_TM_LG     5       /* Enable Transactional Memory */
 #define   FSCR_EBB     __MASK(FSCR_EBB_LG)
 #define   FSCR_DSCR    __MASK(FSCR_DSCR_LG)
 #define SPRN_HFSCR     0xbe    /* HV=1 Facility Status & Control Register */
+#define   HFSCR_MSGP   __MASK(FSCR_MSGP_LG)
 #define   HFSCR_TAR    __MASK(FSCR_TAR_LG)
 #define   HFSCR_EBB    __MASK(FSCR_EBB_LG)
 #define   HFSCR_TM     __MASK(FSCR_TM_LG)
 #define     LPCR_PECE0         ASM_CONST(0x0000000000004000)   /* ext. exceptions can cause exit */
 #define     LPCR_PECE1         ASM_CONST(0x0000000000002000)   /* decrementer can cause exit */
 #define     LPCR_PECE2         ASM_CONST(0x0000000000001000)   /* machine check etc can cause exit */
+#define     LPCR_PECE_HVEE     ASM_CONST(0x0000400000000000)   /* P9 Wakeup on HV interrupts */
 #define   LPCR_MER             ASM_CONST(0x0000000000000800)   /* Mediated External Exception */
 #define   LPCR_MER_SH          11
+#define          LPCR_GTSE             ASM_CONST(0x0000000000000400)   /* Guest Translation Shootdown Enable */
 #define   LPCR_TC              ASM_CONST(0x0000000000000200)   /* Translation control */
 #define   LPCR_LPES            0x0000000c
 #define   LPCR_LPES0           ASM_CONST(0x0000000000000008)      /* LPAR Env selector 0 */
 #define   PCR_VEC_DIS  (1ul << (63-0)) /* Vec. disable (bit NA since POWER8) */
 #define   PCR_VSX_DIS  (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */
 #define   PCR_TM_DIS   (1ul << (63-2)) /* Trans. memory disable (POWER8) */
+/*
+ * These bits are used in the function kvmppc_set_arch_compat() to specify and
+ * determine both the compatibility level which we want to emulate and the
+ * compatibility level which the host is capable of emulating.
+ */
+#define   PCR_ARCH_207 0x8             /* Architecture 2.07 */
 #define   PCR_ARCH_206 0x4             /* Architecture 2.06 */
 #define   PCR_ARCH_205 0x2             /* Architecture 2.05 */
 #define        SPRN_HEIR       0x153   /* Hypervisor Emulated Instruction Register */
 #define PVR_ARCH_206   0x0f000003
 #define PVR_ARCH_206p  0x0f100003
 #define PVR_ARCH_207   0x0f000004
+#define PVR_ARCH_300   0x0f000005
 
 /* Macros for setting and retrieving special purpose registers */
 #ifndef __ASSEMBLY__
index 0d02c11dc331016ee6de817cb0cf127477335920..32db16d2e7ad0c0149ab44ee78200c0103387589 100644 (file)
@@ -176,7 +176,7 @@ static inline void set_hard_smp_processor_id(int cpu, int phys)
 #endif /* !CONFIG_SMP */
 #endif /* !CONFIG_PPC64 */
 
-#if defined(CONFIG_PPC64) && (defined(CONFIG_SMP) || defined(CONFIG_KEXEC))
+#if defined(CONFIG_PPC64) && (defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE))
 extern void smp_release_cpus(void);
 #else
 static inline void smp_release_cpus(void) { };
diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h
new file mode 100644 (file)
index 0000000..6720190
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * GCC stack protector support.
+ *
+ * Stack protector works by putting predefined pattern at the start of
+ * the stack frame and verifying that it hasn't been overwritten when
+ * returning from the function.  The pattern is called stack canary
+ * and gcc expects it to be defined by a global variable called
+ * "__stack_chk_guard" on PPC.  This unfortunately means that on SMP
+ * we cannot have a different canary value per task.
+ */
+
+#ifndef _ASM_STACKPROTECTOR_H
+#define _ASM_STACKPROTECTOR_H
+
+#include <linux/random.h>
+#include <linux/version.h>
+#include <asm/reg.h>
+
+extern unsigned long __stack_chk_guard;
+
+/*
+ * Initialize the stackprotector canary value.
+ *
+ * NOTE: this must only be called from functions that never return,
+ * and it must always be inlined.
+ */
+static __always_inline void boot_init_stack_canary(void)
+{
+       unsigned long canary;
+
+       /* Try to get a semi random initial value. */
+       get_random_bytes(&canary, sizeof(canary));
+       canary ^= mftb();
+       canary ^= LINUX_VERSION_CODE;
+
+       current->stack_canary = canary;
+       __stack_chk_guard = current->stack_canary;
+}
+
+#endif /* _ASM_STACKPROTECTOR_H */
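
The header comment above describes the mechanism -fstack-protector relies on: GCC copies __stack_chk_guard into a canary slot in each protected frame and compares it again before returning. Roughly, the compiler-generated checks behave like the hand-written sketch below; the __stack_chk_fail() failure hook is standard GCC behaviour, not something added by this commit.

/* Hand-written approximation of what GCC emits for a protected function;
 * illustrative only. */
extern unsigned long __stack_chk_guard;
extern void __stack_chk_fail(void);

int copy_name(const char *src)
{
	unsigned long canary = __stack_chk_guard;	/* prologue: stash canary */
	char buf[32];
	int n = 0;

	while (src[n] && n < 31) {
		buf[n] = src[n];
		n++;
	}
	buf[n] = '\0';

	if (canary != __stack_chk_guard)		/* epilogue: verify canary */
		__stack_chk_fail();			/* overwritten: abort */
	return n;
}
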
index 2fc5d4db503ccd03dfb40923267a2b763500cd22..4b369d83fe9ce1ea72b3f2a93590fb132d534512 100644 (file)
@@ -386,3 +386,4 @@ SYSCALL(mlock2)
 SYSCALL(copy_file_range)
 COMPAT_SYS_SPU(preadv2)
 COMPAT_SYS_SPU(pwritev2)
+SYSCALL(kexec_file_load)
index e8cdfec8d5125c531c45b7ffd250955ad68021af..eb1acee91a2034c30d4277fe040cd797279f13b4 100644 (file)
@@ -12,7 +12,7 @@
 #include <uapi/asm/unistd.h>
 
 
-#define NR_syscalls            382
+#define NR_syscalls            383
 
 #define __NR__exit __NR_exit
 
index e9f5f41aa55a1bc206749d56e75bd8edbb1d4068..2f26335a3c42a8141d29156f07105ca82761a98c 100644 (file)
 #define __NR_copy_file_range   379
 #define __NR_preadv2           380
 #define __NR_pwritev2          381
+#define __NR_kexec_file_load   382
 
 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
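With the syscall number wired up, userspace can reach the new loader. A hedged usage sketch follows, assuming the generic kexec_file_load(2) calling convention (kernel fd, initrd fd, command-line length including the terminating NUL, command line, flags); the file paths and command line are placeholders.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_kexec_file_load
#define __NR_kexec_file_load 382	/* number assigned for powerpc above */
#endif

int main(void)
{
	const char *cmdline = "root=/dev/sda2 ro";	/* placeholder */
	int kernel_fd = open("/boot/vmlinux", O_RDONLY);
	int initrd_fd = open("/boot/initrd.img", O_RDONLY);

	if (kernel_fd < 0 || initrd_fd < 0) {
		perror("open");
		return 1;
	}

	/* cmdline length must include the terminating NUL */
	if (syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
		    strlen(cmdline) + 1, cmdline, 0UL) < 0) {
		perror("kexec_file_load");
		return 1;
	}
	return 0;
}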
index 1925341dbb9c9df7ceb05975cf95c0ff8a3339a5..a3a6047fd39502b389d5854f203426b6e79456c1 100644 (file)
@@ -19,6 +19,10 @@ CFLAGS_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
 CFLAGS_btext.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
 CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN)
 
+# -fstack-protector triggers protection checks in this code,
+# but it is being used too early to link to meaningful stack_chk logic.
+CFLAGS_prom_init.o += $(call cc-option, -fno-stack-protector)
+
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace early boot code
 CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
@@ -58,8 +62,6 @@ obj-$(CONFIG_PPC_RTAS)                += rtas.o rtas-rtc.o $(rtaspci-y-y)
 obj-$(CONFIG_PPC_RTAS_DAEMON)  += rtasd.o
 obj-$(CONFIG_RTAS_FLASH)       += rtas_flash.o
 obj-$(CONFIG_RTAS_PROC)                += rtas-proc.o
-obj-$(CONFIG_IBMVIO)           += vio.o
-obj-$(CONFIG_IBMEBUS)           += ibmebus.o
 obj-$(CONFIG_EEH)              += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
                                  eeh_driver.o eeh_event.o eeh_sysfs.o
 obj-$(CONFIG_GENERIC_TBSYNC)   += smp-tbsync.o
@@ -107,8 +109,9 @@ pci64-$(CONFIG_PPC64)               += pci_dn.o pci-hotplug.o isa-bridge.o
 obj-$(CONFIG_PCI)              += pci_$(BITS).o $(pci64-y) \
                                   pci-common.o pci_of_scan.o
 obj-$(CONFIG_PCI_MSI)          += msi.o
-obj-$(CONFIG_KEXEC)            += machine_kexec.o crash.o \
+obj-$(CONFIG_KEXEC_CORE)       += machine_kexec.o crash.o \
                                   machine_kexec_$(BITS).o
+obj-$(CONFIG_KEXEC_FILE)       += machine_kexec_file_$(BITS).o kexec_elf_$(BITS).o
 obj-$(CONFIG_AUDIT)            += audit.o
 obj64-$(CONFIG_AUDIT)          += compat_audit.o
 
@@ -128,7 +131,7 @@ obj64-$(CONFIG_PPC_TRANSACTIONAL_MEM)       += tm.o
 obj-$(CONFIG_PPC64)            += $(obj64-y)
 obj-$(CONFIG_PPC32)            += $(obj32-y)
 
-ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC),)
+ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE),)
 obj-y                          += ppc_save_regs.o
 endif
 
index caec7bf3b99aebc22f5ed14b468e376b571cd9b6..5c860302efecd926bdc757b1c3b0a1cd55254679 100644 (file)
@@ -91,6 +91,9 @@ int main(void)
        DEFINE(TI_livepatch_sp, offsetof(struct thread_info, livepatch_sp));
 #endif
 
+#ifdef CONFIG_CC_STACKPROTECTOR
+       DEFINE(TSK_STACK_CANARY, offsetof(struct task_struct, stack_canary));
+#endif
        DEFINE(KSP, offsetof(struct thread_struct, ksp));
        DEFINE(PT_REGS, offsetof(struct thread_struct, regs));
 #ifdef CONFIG_BOOKE
index 52ff3f025437947484d7567141b5d3802cc887c2..fe35ef2efc280c8deaf83ac8197139e424770950 100644 (file)
@@ -96,6 +96,7 @@ _GLOBAL(__setup_cpu_power9)
        mtlr    r11
        beqlr
        li      r0,0
+       mtspr   SPRN_PSSCR,r0
        mtspr   SPRN_LPID,r0
        mfspr   r3,SPRN_LPCR
        ori     r3, r3, LPCR_PECEDH
@@ -116,6 +117,7 @@ _GLOBAL(__restore_cpu_power9)
        mtlr    r11
        beqlr
        li      r0,0
+       mtspr   SPRN_PSSCR,r0
        mtspr   SPRN_LPID,r0
        mfspr   r3,SPRN_LPCR
        ori     r3, r3, LPCR_PECEDH
@@ -174,7 +176,7 @@ __init_FSCR:
 __init_HFSCR:
        mfspr   r3,SPRN_HFSCR
        ori     r3,r3,HFSCR_TAR|HFSCR_TM|HFSCR_BHRB|HFSCR_PM|\
-                     HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB
+                     HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB|HFSCR_MSGP
        mtspr   SPRN_HFSCR,r3
        blr
 
index f25731627d7f472a44e25bf48e61cffcc2c51055..8180bfd7ab931c5b6d16a1c1b7cb73689206571b 100644 (file)
@@ -372,7 +372,7 @@ static int eeh_phb_check_failure(struct eeh_pe *pe)
        /* Find the PHB PE */
        phb_pe = eeh_phb_pe_get(pe->phb);
        if (!phb_pe) {
-               pr_warn("%s Can't find PE for PHB#%d\n",
+               pr_warn("%s Can't find PE for PHB#%x\n",
                        __func__, pe->phb->global_number);
                return -EEXIST;
        }
@@ -664,7 +664,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function)
        rc = eeh_ops->set_option(pe, function);
        if (rc)
                pr_warn("%s: Unexpected state change %d on "
-                       "PHB#%d-PE#%x, err=%d\n",
+                       "PHB#%x-PE#%x, err=%d\n",
                        __func__, function, pe->phb->global_number,
                        pe->addr, rc);
 
@@ -808,76 +808,67 @@ static void *eeh_set_dev_freset(void *data, void *flag)
 }
 
 /**
- * eeh_reset_pe_once - Assert the pci #RST line for 1/4 second
+ * eeh_pe_reset_full - Complete a full reset process on the indicated PE
  * @pe: EEH PE
  *
- * Assert the PCI #RST line for 1/4 second.
+ * This function executes a full reset procedure on a PE, including setting
+ * the appropriate flags, performing a fundamental or hot reset, and then
+ * deactivating the reset status.  It is designed to be used within the EEH
+ * subsystem, as opposed to eeh_pe_reset which is exported to drivers and
+ * only performs a single operation at a time.
+ *
+ * This function will attempt to reset a PE three times before failing.
  */
-static void eeh_reset_pe_once(struct eeh_pe *pe)
+int eeh_pe_reset_full(struct eeh_pe *pe)
 {
+       int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
+       int reset_state = (EEH_PE_RESET | EEH_PE_CFG_BLOCKED);
+       int type = EEH_RESET_HOT;
        unsigned int freset = 0;
+       int i, state, ret;
 
-       /* Determine type of EEH reset required for
-        * Partitionable Endpoint, a hot-reset (1)
-        * or a fundamental reset (3).
-        * A fundamental reset required by any device under
-        * Partitionable Endpoint trumps hot-reset.
+       /*
+        * Determine the type of reset to perform - hot or fundamental.
+        * Hot reset is the default operation, unless any device under the
+        * PE requires a fundamental reset.
         */
        eeh_pe_dev_traverse(pe, eeh_set_dev_freset, &freset);
 
        if (freset)
-               eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
-       else
-               eeh_ops->reset(pe, EEH_RESET_HOT);
-
-       eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
-}
-
-/**
- * eeh_reset_pe - Reset the indicated PE
- * @pe: EEH PE
- *
- * This routine should be called to reset indicated device, including
- * PE. A PE might include multiple PCI devices and sometimes PCI bridges
- * might be involved as well.
- */
-int eeh_reset_pe(struct eeh_pe *pe)
-{
-       int flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
-       int i, state, ret;
+               type = EEH_RESET_FUNDAMENTAL;
 
-       /* Mark as reset and block config space */
-       eeh_pe_state_mark(pe, EEH_PE_RESET | EEH_PE_CFG_BLOCKED);
+       /* Mark the PE as in reset state and block config space accesses */
+       eeh_pe_state_mark(pe, reset_state);
 
-       /* Take three shots at resetting the bus */
+       /* Make three attempts at resetting the bus */
        for (i = 0; i < 3; i++) {
-               eeh_reset_pe_once(pe);
+               ret = eeh_pe_reset(pe, type);
+               if (ret)
+                       break;
 
-               /*
-                * EEH_PE_ISOLATED is expected to be removed after
-                * BAR restore.
-                */
+               ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE);
+               if (ret)
+                       break;
+
+               /* Wait until the PE is in a functioning state */
                state = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
-               if ((state & flags) == flags) {
-                       ret = 0;
-                       goto out;
-               }
+               if ((state & active_flags) == active_flags)
+                       break;
 
                if (state < 0) {
-                       pr_warn("%s: Unrecoverable slot failure on PHB#%d-PE#%x",
+                       pr_warn("%s: Unrecoverable slot failure on PHB#%x-PE#%x",
                                __func__, pe->phb->global_number, pe->addr);
                        ret = -ENOTRECOVERABLE;
-                       goto out;
+                       break;
                }
 
-               /* We might run out of credits */
+               /* Set error in case this is our last attempt */
                ret = -EIO;
                pr_warn("%s: Failure %d resetting PHB#%x-PE#%x\n (%d)\n",
                        __func__, state, pe->phb->global_number, pe->addr, (i + 1));
        }
 
-out:
-       eeh_pe_state_clear(pe, EEH_PE_RESET | EEH_PE_CFG_BLOCKED);
+       eeh_pe_state_clear(pe, reset_state);
        return ret;
 }
 
@@ -1601,6 +1592,7 @@ static int eeh_pe_reenable_devices(struct eeh_pe *pe)
        return eeh_unfreeze_pe(pe, true);
 }
 
+
 /**
  * eeh_pe_reset - Issue PE reset according to specified type
  * @pe: EEH PE
index a62be72da274de3993a4b28cfbad7938d5502559..555a47bd5d1a726014742b48b292fecabc8ef5d1 100644 (file)
@@ -588,7 +588,7 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe)
        eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL);
 
        /* Issue reset */
-       ret = eeh_reset_pe(pe);
+       ret = eeh_pe_reset_full(pe);
        if (ret) {
                eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
                return ret;
@@ -659,7 +659,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
         * config accesses. So we prefer to block them. However, controlled
         * PCI config accesses initiated from EEH itself are allowed.
         */
-       rc = eeh_reset_pe(pe);
+       rc = eeh_pe_reset_full(pe);
        if (rc)
                return rc;
 
@@ -732,7 +732,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 
        frozen_bus = eeh_pe_bus_get(pe);
        if (!frozen_bus) {
-               pr_err("%s: Cannot find PCI bus for PHB#%d-PE#%x\n",
+               pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
                        __func__, pe->phb->global_number, pe->addr);
                return;
        }
@@ -876,7 +876,7 @@ excess_failures:
         * are due to poorly seated PCI cards. Only 10% or so are
         * due to actual, failed cards.
         */
-       pr_err("EEH: PHB#%d-PE#%x has failed %d times in the\n"
+       pr_err("EEH: PHB#%x-PE#%x has failed %d times in the\n"
               "last hour and has been permanently disabled.\n"
               "Please try reseating or replacing it.\n",
                pe->phb->global_number, pe->addr,
@@ -884,7 +884,7 @@ excess_failures:
        goto perm_error;
 
 hard_fail:
-       pr_err("EEH: Unable to recover from failure from PHB#%d-PE#%x.\n"
+       pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
               "Please try reseating or replacing it\n",
                pe->phb->global_number, pe->addr);
 
@@ -998,7 +998,7 @@ static void eeh_handle_special_event(void)
                                bus = eeh_pe_bus_get(phb_pe);
                                if (!bus) {
                                        pr_err("%s: Cannot find PCI bus for "
-                                              "PHB#%d-PE#%x\n",
+                                              "PHB#%x-PE#%x\n",
                                               __func__,
                                               pe->phb->global_number,
                                               pe->addr);
index 82e7327e3cd0ec7cec0e0655dd75ce1b55465968..accbf8b5fd46f4a701e69ce4292a8dc03cf0dc0d 100644 (file)
@@ -75,11 +75,11 @@ static int eeh_event_handler(void * dummy)
                if (pe) {
                        eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
                        if (pe->type & EEH_PE_PHB)
-                               pr_info("EEH: Detected error on PHB#%d\n",
+                               pr_info("EEH: Detected error on PHB#%x\n",
                                         pe->phb->global_number);
                        else
                                pr_info("EEH: Detected PCI bus error on "
-                                       "PHB#%d-PE#%x\n",
+                                       "PHB#%x-PE#%x\n",
                                        pe->phb->global_number, pe->addr);
                        eeh_handle_event(pe);
                        eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
index de7d091c4c31d4a76429ff0d58f20c73d31464e0..cc4b206f77e422872e8e0691d2cc90900509d91e 100644 (file)
@@ -104,7 +104,7 @@ int eeh_phb_pe_create(struct pci_controller *phb)
        /* Put it into the list */
        list_add_tail(&pe->child, &eeh_phb_pe);
 
-       pr_debug("EEH: Add PE for PHB#%d\n", phb->global_number);
+       pr_debug("EEH: Add PE for PHB#%x\n", phb->global_number);
 
        return 0;
 }
@@ -333,7 +333,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 
        /* Check if the PE number is valid */
        if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) {
-               pr_err("%s: Invalid PE#0 for edev 0x%x on PHB#%d\n",
+               pr_err("%s: Invalid PE#0 for edev 0x%x on PHB#%x\n",
                       __func__, edev->config_addr, edev->phb->global_number);
                return -EINVAL;
        }
index 3841d749a430069f4d4f2705c4199c08609b3757..5742dbdbee4677924ebf0019b891e43879410131 100644 (file)
@@ -674,7 +674,11 @@ BEGIN_FTR_SECTION
        mtspr   SPRN_SPEFSCR,r0         /* restore SPEFSCR reg */
 END_FTR_SECTION_IFSET(CPU_FTR_SPE)
 #endif /* CONFIG_SPE */
-
+#if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
+       lwz     r0,TSK_STACK_CANARY(r2)
+       lis     r4,__stack_chk_guard@ha
+       stw     r0,__stack_chk_guard@l(r4)
+#endif
        lwz     r0,_CCR(r1)
        mtcrf   0xFF,r0
        /* r3-r12 are destroyed -- Cort */
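The hunk above loads TSK_STACK_CANARY from the incoming task and stores it into __stack_chk_guard on the 32-bit context-switch path. A rough C equivalent of what those three instructions do, with switch_stack_canary() being an illustrative name rather than a function added by this patch:

#if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
/*
 * On !SMP there is only one global guard, so it is refreshed from the
 * task being switched in; each task still sees its own canary value.
 */
static inline void switch_stack_canary(struct task_struct *next)
{
	__stack_chk_guard = next->stack_canary;
}
#endif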
index 38a1f96430e10499dd465c98f426a7d970a38271..45b453e4d0c87bf62c9100461bc33fb165f29931 100644 (file)
@@ -923,10 +923,10 @@ kernel_dbg_exc:
                                PROLOG_ADDITION_NONE)
        EXCEPTION_COMMON(0x340)
        addi    r3,r1,STACK_FRAME_OVERHEAD
-       bl      .save_nvgprs
+       bl      save_nvgprs
        INTS_RESTORE_HARD
-       bl      .unknown_exception
-       b       .ret_from_except
+       bl      unknown_exception
+       b       ret_from_except
 
 /*
  * An interrupt came in while soft-disabled; We mark paca->irq_happened
index a95639b8d4ac5d72a7be17d2b1e20e3f5fa874a1..5c9f50c1aa992867c57cad33983ccac0460ef2f6 100644 (file)
@@ -47,13 +47,11 @@ ftrace_modify_code(unsigned long ip, unsigned int old, unsigned int new)
        unsigned int replaced;
 
        /*
-        * Note: Due to modules and __init, code can
-        *  disappear and change, we need to protect against faulting
-        *  as well as code changing. We do this by using the
-        *  probe_kernel_* functions.
-        *
-        * No real locking needed, this code is run through
-        * kstop_machine, or before SMP starts.
+        * Note:
+        * We are paranoid about modifying text, as if a bug was to happen, it
+        * could cause us to read or write to someplace that could cause harm.
+        * Carefully read and modify the code with probe_kernel_*(), and make
+        * sure what we read is what we expected it to be before modifying it.
         */
 
        /* read the text we want to modify */
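The reworded comment stresses reading the site back and verifying it before patching. A condensed sketch of that read/compare/write pattern using the probe_kernel_*() helpers; patch_insn() is a hypothetical helper, not the ftrace_modify_code() in this file, which also uses powerpc's own text-patching path for the write.

#include <linux/errno.h>
#include <linux/uaccess.h>	/* probe_kernel_read(), probe_kernel_write() */

static int patch_insn(unsigned long ip, unsigned int expected,
		      unsigned int new_insn)
{
	unsigned int cur;

	/* Read through a fault-tolerant accessor: module or __init text may
	 * have vanished, and we want -EFAULT back rather than an oops. */
	if (probe_kernel_read(&cur, (void *)ip, sizeof(cur)))
		return -EFAULT;

	/* Only patch if the site still contains what we expect to find. */
	if (cur != expected)
		return -EINVAL;

	/* Write the replacement the same careful way. */
	if (probe_kernel_write((void *)ip, &new_insn, sizeof(new_insn)))
		return -EPERM;

	return 0;
}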
index 451a8e1cf57b649df84561798279e3f3749a7a19..1dc5eae2ced3ad011bfa094c2e66b5a099ddf853 100644 (file)
@@ -160,7 +160,7 @@ __secondary_hold:
        cmpdi   0,r12,0
        beq     100b
 
-#if defined(CONFIG_SMP) || defined(CONFIG_KEXEC)
+#if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE)
 #ifdef CONFIG_PPC_BOOK3E
        tovirt(r12,r12)
 #endif
@@ -221,9 +221,9 @@ booting_thread_hwid:
  */
 _GLOBAL(book3e_start_thread)
        LOAD_REG_IMMEDIATE(r5, MSR_KERNEL)
-       cmpi    0, r3, 0
+       cmpwi   r3, 0
        beq     10f
-       cmpi    0, r3, 1
+       cmpwi   r3, 1
        beq     11f
        /* If the thread id is invalid, just exit. */
        b       13f
@@ -248,9 +248,9 @@ _GLOBAL(book3e_start_thread)
  * r3 = the thread physical id
  */
 _GLOBAL(book3e_stop_thread)
-       cmpi    0, r3, 0
+       cmpwi   r3, 0
        beq     10f
-       cmpi    0, r3, 1
+       cmpwi   r3, 1
        beq     10f
        /* If the thread id is invalid, just exit. */
        b       13f
diff --git a/arch/powerpc/kernel/ibmebus.c b/arch/powerpc/kernel/ibmebus.c
deleted file mode 100644 (file)
index 35f5244..0000000
+++ /dev/null
@@ -1,767 +0,0 @@
-/*
- * IBM PowerPC IBM eBus Infrastructure Support.
- *
- * Copyright (c) 2005 IBM Corporation
- *  Joachim Fenkes <fenkes@de.ibm.com>
- *  Heiko J Schick <schickhj@de.ibm.com>
- *
- * All rights reserved.
- *
- * This source code is distributed under a dual license of GPL v2.0 and OpenIB
- * BSD.
- *
- * OpenIB BSD License
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials
- * provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/console.h>
-#include <linux/kobject.h>
-#include <linux/dma-mapping.h>
-#include <linux/interrupt.h>
-#include <linux/of.h>
-#include <linux/slab.h>
-#include <linux/stat.h>
-#include <linux/of_platform.h>
-#include <asm/ibmebus.h>
-
-static struct device ibmebus_bus_device = { /* fake "parent" device */
-       .init_name = "ibmebus",
-};
-
-struct bus_type ibmebus_bus_type;
-
-/* These devices will automatically be added to the bus during init */
-static const struct of_device_id ibmebus_matches[] __initconst = {
-       { .compatible = "IBM,lhca" },
-       { .compatible = "IBM,lhea" },
-       {},
-};
-
-static void *ibmebus_alloc_coherent(struct device *dev,
-                                   size_t size,
-                                   dma_addr_t *dma_handle,
-                                   gfp_t flag,
-                                   unsigned long attrs)
-{
-       void *mem;
-
-       mem = kmalloc(size, flag);
-       *dma_handle = (dma_addr_t)mem;
-
-       return mem;
-}
-
-static void ibmebus_free_coherent(struct device *dev,
-                                 size_t size, void *vaddr,
-                                 dma_addr_t dma_handle,
-                                 unsigned long attrs)
-{
-       kfree(vaddr);
-}
-
-static dma_addr_t ibmebus_map_page(struct device *dev,
-                                  struct page *page,
-                                  unsigned long offset,
-                                  size_t size,
-                                  enum dma_data_direction direction,
-                                  unsigned long attrs)
-{
-       return (dma_addr_t)(page_address(page) + offset);
-}
-
-static void ibmebus_unmap_page(struct device *dev,
-                              dma_addr_t dma_addr,
-                              size_t size,
-                              enum dma_data_direction direction,
-                              unsigned long attrs)
-{
-       return;
-}
-
-static int ibmebus_map_sg(struct device *dev,
-                         struct scatterlist *sgl,
-                         int nents, enum dma_data_direction direction,
-                         unsigned long attrs)
-{
-       struct scatterlist *sg;
-       int i;
-
-       for_each_sg(sgl, sg, nents, i) {
-               sg->dma_address = (dma_addr_t) sg_virt(sg);
-               sg->dma_length = sg->length;
-       }
-
-       return nents;
-}
-
-static void ibmebus_unmap_sg(struct device *dev,
-                            struct scatterlist *sg,
-                            int nents, enum dma_data_direction direction,
-                            unsigned long attrs)
-{
-       return;
-}
-
-static int ibmebus_dma_supported(struct device *dev, u64 mask)
-{
-       return mask == DMA_BIT_MASK(64);
-}
-
-static u64 ibmebus_dma_get_required_mask(struct device *dev)
-{
-       return DMA_BIT_MASK(64);
-}
-
-static struct dma_map_ops ibmebus_dma_ops = {
-       .alloc              = ibmebus_alloc_coherent,
-       .free               = ibmebus_free_coherent,
-       .map_sg             = ibmebus_map_sg,
-       .unmap_sg           = ibmebus_unmap_sg,
-       .dma_supported      = ibmebus_dma_supported,
-       .get_required_mask  = ibmebus_dma_get_required_mask,
-       .map_page           = ibmebus_map_page,
-       .unmap_page         = ibmebus_unmap_page,
-};
-
-static int ibmebus_match_path(struct device *dev, void *data)
-{
-       struct device_node *dn = to_platform_device(dev)->dev.of_node;
-       return (dn->full_name &&
-               (strcasecmp((char *)data, dn->full_name) == 0));
-}
-
-static int ibmebus_match_node(struct device *dev, void *data)
-{
-       return to_platform_device(dev)->dev.of_node == data;
-}
-
-static int ibmebus_create_device(struct device_node *dn)
-{
-       struct platform_device *dev;
-       int ret;
-
-       dev = of_device_alloc(dn, NULL, &ibmebus_bus_device);
-       if (!dev)
-               return -ENOMEM;
-
-       dev->dev.bus = &ibmebus_bus_type;
-       dev->dev.archdata.dma_ops = &ibmebus_dma_ops;
-
-       ret = of_device_add(dev);
-       if (ret)
-               platform_device_put(dev);
-       return ret;
-}
-
-static int ibmebus_create_devices(const struct of_device_id *matches)
-{
-       struct device_node *root, *child;
-       struct device *dev;
-       int ret = 0;
-
-       root = of_find_node_by_path("/");
-
-       for_each_child_of_node(root, child) {
-               if (!of_match_node(matches, child))
-                       continue;
-
-               dev = bus_find_device(&ibmebus_bus_type, NULL, child,
-                                     ibmebus_match_node);
-               if (dev) {
-                       put_device(dev);
-                       continue;
-               }
-
-               ret = ibmebus_create_device(child);
-               if (ret) {
-                       printk(KERN_ERR "%s: failed to create device (%i)",
-                              __func__, ret);
-                       of_node_put(child);
-                       break;
-               }
-       }
-
-       of_node_put(root);
-       return ret;
-}
-
-int ibmebus_register_driver(struct platform_driver *drv)
-{
-       /* If the driver uses devices that ibmebus doesn't know, add them */
-       ibmebus_create_devices(drv->driver.of_match_table);
-
-       drv->driver.bus = &ibmebus_bus_type;
-       return driver_register(&drv->driver);
-}
-EXPORT_SYMBOL(ibmebus_register_driver);
-
-void ibmebus_unregister_driver(struct platform_driver *drv)
-{
-       driver_unregister(&drv->driver);
-}
-EXPORT_SYMBOL(ibmebus_unregister_driver);
-
-int ibmebus_request_irq(u32 ist, irq_handler_t handler,
-                       unsigned long irq_flags, const char *devname,
-                       void *dev_id)
-{
-       unsigned int irq = irq_create_mapping(NULL, ist);
-
-       if (!irq)
-               return -EINVAL;
-
-       return request_irq(irq, handler, irq_flags, devname, dev_id);
-}
-EXPORT_SYMBOL(ibmebus_request_irq);
-
-void ibmebus_free_irq(u32 ist, void *dev_id)
-{
-       unsigned int irq = irq_find_mapping(NULL, ist);
-
-       free_irq(irq, dev_id);
-       irq_dispose_mapping(irq);
-}
-EXPORT_SYMBOL(ibmebus_free_irq);
-
-static char *ibmebus_chomp(const char *in, size_t count)
-{
-       char *out = kmalloc(count + 1, GFP_KERNEL);
-
-       if (!out)
-               return NULL;
-
-       memcpy(out, in, count);
-       out[count] = '\0';
-       if (out[count - 1] == '\n')
-               out[count - 1] = '\0';
-
-       return out;
-}
-
-static ssize_t ibmebus_store_probe(struct bus_type *bus,
-                                  const char *buf, size_t count)
-{
-       struct device_node *dn = NULL;
-       struct device *dev;
-       char *path;
-       ssize_t rc = 0;
-
-       path = ibmebus_chomp(buf, count);
-       if (!path)
-               return -ENOMEM;
-
-       dev = bus_find_device(&ibmebus_bus_type, NULL, path,
-                             ibmebus_match_path);
-       if (dev) {
-               put_device(dev);
-               printk(KERN_WARNING "%s: %s has already been probed\n",
-                      __func__, path);
-               rc = -EEXIST;
-               goto out;
-       }
-
-       if ((dn = of_find_node_by_path(path))) {
-               rc = ibmebus_create_device(dn);
-               of_node_put(dn);
-       } else {
-               printk(KERN_WARNING "%s: no such device node: %s\n",
-                      __func__, path);
-               rc = -ENODEV;
-       }
-
-out:
-       kfree(path);
-       if (rc)
-               return rc;
-       return count;
-}
-static BUS_ATTR(probe, S_IWUSR, NULL, ibmebus_store_probe);
-
-static ssize_t ibmebus_store_remove(struct bus_type *bus,
-                                   const char *buf, size_t count)
-{
-       struct device *dev;
-       char *path;
-
-       path = ibmebus_chomp(buf, count);
-       if (!path)
-               return -ENOMEM;
-
-       if ((dev = bus_find_device(&ibmebus_bus_type, NULL, path,
-                                  ibmebus_match_path))) {
-               of_device_unregister(to_platform_device(dev));
-               put_device(dev);
-
-               kfree(path);
-               return count;
-       } else {
-               printk(KERN_WARNING "%s: %s not on the bus\n",
-                      __func__, path);
-
-               kfree(path);
-               return -ENODEV;
-       }
-}
-static BUS_ATTR(remove, S_IWUSR, NULL, ibmebus_store_remove);
-
-static struct attribute *ibmbus_bus_attrs[] = {
-       &bus_attr_probe.attr,
-       &bus_attr_remove.attr,
-       NULL,
-};
-ATTRIBUTE_GROUPS(ibmbus_bus);
-
-static int ibmebus_bus_bus_match(struct device *dev, struct device_driver *drv)
-{
-       const struct of_device_id *matches = drv->of_match_table;
-
-       if (!matches)
-               return 0;
-
-       return of_match_device(matches, dev) != NULL;
-}
-
-static int ibmebus_bus_device_probe(struct device *dev)
-{
-       int error = -ENODEV;
-       struct platform_driver *drv;
-       struct platform_device *of_dev;
-
-       drv = to_platform_driver(dev->driver);
-       of_dev = to_platform_device(dev);
-
-       if (!drv->probe)
-               return error;
-
-       of_dev_get(of_dev);
-
-       if (of_driver_match_device(dev, dev->driver))
-               error = drv->probe(of_dev);
-       if (error)
-               of_dev_put(of_dev);
-
-       return error;
-}
-
-static int ibmebus_bus_device_remove(struct device *dev)
-{
-       struct platform_device *of_dev = to_platform_device(dev);
-       struct platform_driver *drv = to_platform_driver(dev->driver);
-
-       if (dev->driver && drv->remove)
-               drv->remove(of_dev);
-       return 0;
-}
-
-static void ibmebus_bus_device_shutdown(struct device *dev)
-{
-       struct platform_device *of_dev = to_platform_device(dev);
-       struct platform_driver *drv = to_platform_driver(dev->driver);
-
-       if (dev->driver && drv->shutdown)
-               drv->shutdown(of_dev);
-}
-
-/*
- * ibmebus_bus_device_attrs
- */
-static ssize_t devspec_show(struct device *dev,
-                               struct device_attribute *attr, char *buf)
-{
-       struct platform_device *ofdev;
-
-       ofdev = to_platform_device(dev);
-       return sprintf(buf, "%s\n", ofdev->dev.of_node->full_name);
-}
-
-static ssize_t name_show(struct device *dev,
-                               struct device_attribute *attr, char *buf)
-{
-       struct platform_device *ofdev;
-
-       ofdev = to_platform_device(dev);
-       return sprintf(buf, "%s\n", ofdev->dev.of_node->name);
-}
-
-static ssize_t modalias_show(struct device *dev,
-                               struct device_attribute *attr, char *buf)
-{
-       ssize_t len = of_device_get_modalias(dev, buf, PAGE_SIZE - 2);
-       buf[len] = '\n';
-       buf[len+1] = 0;
-       return len+1;
-}
-
-static struct device_attribute ibmebus_bus_device_attrs[] = {
-       __ATTR_RO(devspec),
-       __ATTR_RO(name),
-       __ATTR_RO(modalias),
-       __ATTR_NULL
-};
-
-#ifdef CONFIG_PM_SLEEP
-static int ibmebus_bus_legacy_suspend(struct device *dev, pm_message_t mesg)
-{
-       struct platform_device *of_dev = to_platform_device(dev);
-       struct platform_driver *drv = to_platform_driver(dev->driver);
-       int ret = 0;
-
-       if (dev->driver && drv->suspend)
-               ret = drv->suspend(of_dev, mesg);
-       return ret;
-}
-
-static int ibmebus_bus_legacy_resume(struct device *dev)
-{
-       struct platform_device *of_dev = to_platform_device(dev);
-       struct platform_driver *drv = to_platform_driver(dev->driver);
-       int ret = 0;
-
-       if (dev->driver && drv->resume)
-               ret = drv->resume(of_dev);
-       return ret;
-}
-
-static int ibmebus_bus_pm_prepare(struct device *dev)
-{
-       struct device_driver *drv = dev->driver;
-       int ret = 0;
-
-       if (drv && drv->pm && drv->pm->prepare)
-               ret = drv->pm->prepare(dev);
-
-       return ret;
-}
-
-static void ibmebus_bus_pm_complete(struct device *dev)
-{
-       struct device_driver *drv = dev->driver;
-
-       if (drv && drv->pm && drv->pm->complete)
-               drv->pm->complete(dev);
-}
-
-#ifdef CONFIG_SUSPEND
-
-static int ibmebus_bus_pm_suspend(struct device *dev)
-{
-       struct device_driver *drv = dev->driver;
-       int ret = 0;
-
-       if (!drv)
-               return 0;
-
-       if (drv->pm) {
-               if (drv->pm->suspend)
-                       ret = drv->pm->suspend(dev);
-       } else {
-               ret = ibmebus_bus_legacy_suspend(dev, PMSG_SUSPEND);
-       }
-
-       return ret;
-}
-
-static int ibmebus_bus_pm_suspend_noirq(struct device *dev)
-{
-       struct device_driver *drv = dev->driver;
-       int ret = 0;
-
-       if (!drv)
-               return 0;
-
-       if (drv->pm) {
-               if (drv->pm->suspend_noirq)
-                       ret = drv->pm->suspend_noirq(dev);
-       }
-
-       return ret;
-}
-
-static int ibmebus_bus_pm_resume(struct device *dev)
-{
-       struct device_driver *drv = dev->driver;
-       int ret = 0;
-
-       if (!drv)
-               return 0;
-
-       if (drv->pm) {
-               if (drv->pm->resume)
-                       ret = drv->pm->resume(dev);
-       } else {
-               ret = ibmebus_bus_legacy_resume(dev);
-       }
-
-       return ret;
-}
-
-static int ibmebus_bus_pm_resume_noirq(struct device *dev)
-{
-       struct device_driver *drv = dev->driver;
-       int ret = 0;
-
-       if (!drv)
-               return 0;
-
-       if (drv->pm) {
-               if (drv->pm->resume_noirq)
-                       ret = drv->pm->resume_noirq(dev);
-       }
-
-       return ret;
-}
-
-#else /* !CONFIG_SUSPEND */
-
-#define ibmebus_bus_pm_suspend         NULL
-#define ibmebus_bus_pm_resume          NULL
-#define ibmebus_bus_pm_suspend_noirq   NULL
-#define ibmebus_bus_pm_resume_noirq    NULL
-
-#endif /* !CONFIG_SUSPEND */
-
-#ifdef CONFIG_HIBERNATE_CALLBACKS
-
-static int ibmebus_bus_pm_freeze(struct device *dev)
-{
-       struct device_driver *drv = dev->driver;
-       int ret = 0;
-
-       if (!drv)
-               return 0;
-
-       if (drv->pm) {
-               if (drv->pm->freeze)
-                       ret = drv->pm->freeze(dev);
-       } else {
-               ret = ibmebus_bus_legacy_suspend(dev, PMSG_FREEZE);
-       }
-
-       return ret;
-}
-
-static int ibmebus_bus_pm_freeze_noirq(struct device *dev)
-{
-       struct device_driver *drv = dev->driver;
-       int ret = 0;
-
-       if (!drv)
-               return 0;
-
-       if (drv->pm) {
-               if (drv->pm->freeze_noirq)
-                       ret = drv->pm->freeze_noirq(dev);
-       }
-
-       return ret;
-}
-
-static int ibmebus_bus_pm_thaw(struct device *dev)
-{
-       struct device_driver *drv = dev->driver;
-       int ret = 0;
-
-       if (!drv)
-               return 0;
-
-       if (drv->pm) {
-               if (drv->pm->thaw)
-                       ret = drv->pm->thaw(dev);
-       } else {
-               ret = ibmebus_bus_legacy_resume(dev);
-       }
-
-       return ret;
-}
-
-static int ibmebus_bus_pm_thaw_noirq(struct device *dev)
-{
-       struct device_driver *drv = dev->driver;
-       int ret = 0;
-
-       if (!drv)
-               return 0;
-
-       if (drv->pm) {
-               if (drv->pm->thaw_noirq)
-                       ret = drv->pm->thaw_noirq(dev);
-       }
-
-       return ret;
-}
-
-static int ibmebus_bus_pm_poweroff(struct device *dev)
-{
-       struct device_driver *drv = dev->driver;
-       int ret = 0;
-
-       if (!drv)
-               return 0;
-
-       if (drv->pm) {
-               if (drv->pm->poweroff)
-                       ret = drv->pm->poweroff(dev);
-       } else {
-               ret = ibmebus_bus_legacy_suspend(dev, PMSG_HIBERNATE);
-       }
-
-       return ret;
-}
-
-static int ibmebus_bus_pm_poweroff_noirq(struct device *dev)
-{
-       struct device_driver *drv = dev->driver;
-       int ret = 0;
-
-       if (!drv)
-               return 0;
-
-       if (drv->pm) {
-               if (drv->pm->poweroff_noirq)
-                       ret = drv->pm->poweroff_noirq(dev);
-       }
-
-       return ret;
-}
-
-static int ibmebus_bus_pm_restore(struct device *dev)
-{
-       struct device_driver *drv = dev->driver;
-       int ret = 0;
-
-       if (!drv)
-               return 0;
-
-       if (drv->pm) {
-               if (drv->pm->restore)
-                       ret = drv->pm->restore(dev);
-       } else {
-               ret = ibmebus_bus_legacy_resume(dev);
-       }
-
-       return ret;
-}
-
-static int ibmebus_bus_pm_restore_noirq(struct device *dev)
-{
-       struct device_driver *drv = dev->driver;
-       int ret = 0;
-
-       if (!drv)
-               return 0;
-
-       if (drv->pm) {
-               if (drv->pm->restore_noirq)
-                       ret = drv->pm->restore_noirq(dev);
-       }
-
-       return ret;
-}
-
-#else /* !CONFIG_HIBERNATE_CALLBACKS */
-
-#define ibmebus_bus_pm_freeze          NULL
-#define ibmebus_bus_pm_thaw            NULL
-#define ibmebus_bus_pm_poweroff                NULL
-#define ibmebus_bus_pm_restore         NULL
-#define ibmebus_bus_pm_freeze_noirq    NULL
-#define ibmebus_bus_pm_thaw_noirq              NULL
-#define ibmebus_bus_pm_poweroff_noirq  NULL
-#define ibmebus_bus_pm_restore_noirq   NULL
-
-#endif /* !CONFIG_HIBERNATE_CALLBACKS */
-
-static struct dev_pm_ops ibmebus_bus_dev_pm_ops = {
-       .prepare = ibmebus_bus_pm_prepare,
-       .complete = ibmebus_bus_pm_complete,
-       .suspend = ibmebus_bus_pm_suspend,
-       .resume = ibmebus_bus_pm_resume,
-       .freeze = ibmebus_bus_pm_freeze,
-       .thaw = ibmebus_bus_pm_thaw,
-       .poweroff = ibmebus_bus_pm_poweroff,
-       .restore = ibmebus_bus_pm_restore,
-       .suspend_noirq = ibmebus_bus_pm_suspend_noirq,
-       .resume_noirq = ibmebus_bus_pm_resume_noirq,
-       .freeze_noirq = ibmebus_bus_pm_freeze_noirq,
-       .thaw_noirq = ibmebus_bus_pm_thaw_noirq,
-       .poweroff_noirq = ibmebus_bus_pm_poweroff_noirq,
-       .restore_noirq = ibmebus_bus_pm_restore_noirq,
-};
-
-#define IBMEBUS_BUS_PM_OPS_PTR (&ibmebus_bus_dev_pm_ops)
-
-#else /* !CONFIG_PM_SLEEP */
-
-#define IBMEBUS_BUS_PM_OPS_PTR NULL
-
-#endif /* !CONFIG_PM_SLEEP */
-
-struct bus_type ibmebus_bus_type = {
-       .name      = "ibmebus",
-       .uevent    = of_device_uevent_modalias,
-       .bus_groups = ibmbus_bus_groups,
-       .match     = ibmebus_bus_bus_match,
-       .probe     = ibmebus_bus_device_probe,
-       .remove    = ibmebus_bus_device_remove,
-       .shutdown  = ibmebus_bus_device_shutdown,
-       .dev_attrs = ibmebus_bus_device_attrs,
-       .pm        = IBMEBUS_BUS_PM_OPS_PTR,
-};
-EXPORT_SYMBOL(ibmebus_bus_type);
-
-static int __init ibmebus_bus_init(void)
-{
-       int err;
-
-       printk(KERN_INFO "IBM eBus Device Driver\n");
-
-       err = bus_register(&ibmebus_bus_type);
-       if (err) {
-               printk(KERN_ERR "%s: failed to register IBM eBus.\n",
-                      __func__);
-               return err;
-       }
-
-       err = device_register(&ibmebus_bus_device);
-       if (err) {
-               printk(KERN_WARNING "%s: device_register returned %i\n",
-                      __func__, err);
-               bus_unregister(&ibmebus_bus_type);
-
-               return err;
-       }
-
-       err = ibmebus_create_devices(ibmebus_matches);
-       if (err) {
-               device_unregister(&ibmebus_bus_device);
-               bus_unregister(&ibmebus_bus_type);
-               return err;
-       }
-
-       return 0;
-}
-postcore_initcall(ibmebus_bus_init);
diff --git a/arch/powerpc/kernel/kexec_elf_64.c b/arch/powerpc/kernel/kexec_elf_64.c
new file mode 100644 (file)
index 0000000..6acffd3
--- /dev/null
@@ -0,0 +1,663 @@
+/*
+ * Load ELF vmlinux file for the kexec_file_load syscall.
+ *
+ * Copyright (C) 2004  Adam Litke (agl@us.ibm.com)
+ * Copyright (C) 2004  IBM Corp.
+ * Copyright (C) 2005  R Sharada (sharada@in.ibm.com)
+ * Copyright (C) 2006  Mohan Kumar M (mohan@in.ibm.com)
+ * Copyright (C) 2016  IBM Corporation
+ *
+ * Based on kexec-tools' kexec-elf-exec.c and kexec-elf-ppc64.c.
+ * Heavily modified for the kernel by
+ * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation (version 2 of the License).
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt)    "kexec_elf: " fmt
+
+#include <linux/elf.h>
+#include <linux/kexec.h>
+#include <linux/libfdt.h>
+#include <linux/module.h>
+#include <linux/of_fdt.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#define PURGATORY_STACK_SIZE   (16 * 1024)
+
+#define elf_addr_to_cpu        elf64_to_cpu
+
+#ifndef Elf_Rel
+#define Elf_Rel                Elf64_Rel
+#endif /* Elf_Rel */
+
+struct elf_info {
+       /*
+        * Where the ELF binary contents are kept.
+        * Memory managed by the user of the struct.
+        */
+       const char *buffer;
+
+       const struct elfhdr *ehdr;
+       const struct elf_phdr *proghdrs;
+       struct elf_shdr *sechdrs;
+};
+
+static inline bool elf_is_elf_file(const struct elfhdr *ehdr)
+{
+       return memcmp(ehdr->e_ident, ELFMAG, SELFMAG) == 0;
+}
+
+static uint64_t elf64_to_cpu(const struct elfhdr *ehdr, uint64_t value)
+{
+       if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+               value = le64_to_cpu(value);
+       else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+               value = be64_to_cpu(value);
+
+       return value;
+}
+
+static uint16_t elf16_to_cpu(const struct elfhdr *ehdr, uint16_t value)
+{
+       if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+               value = le16_to_cpu(value);
+       else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+               value = be16_to_cpu(value);
+
+       return value;
+}
+
+static uint32_t elf32_to_cpu(const struct elfhdr *ehdr, uint32_t value)
+{
+       if (ehdr->e_ident[EI_DATA] == ELFDATA2LSB)
+               value = le32_to_cpu(value);
+       else if (ehdr->e_ident[EI_DATA] == ELFDATA2MSB)
+               value = be32_to_cpu(value);
+
+       return value;
+}
+
+/**
+ * elf_is_ehdr_sane - check that it is safe to use the ELF header
+ * @buf_len:   size of the buffer in which the ELF file is loaded.
+ */
+static bool elf_is_ehdr_sane(const struct elfhdr *ehdr, size_t buf_len)
+{
+       if (ehdr->e_phnum > 0 && ehdr->e_phentsize != sizeof(struct elf_phdr)) {
+               pr_debug("Bad program header size.\n");
+               return false;
+       } else if (ehdr->e_shnum > 0 &&
+                  ehdr->e_shentsize != sizeof(struct elf_shdr)) {
+               pr_debug("Bad section header size.\n");
+               return false;
+       } else if (ehdr->e_ident[EI_VERSION] != EV_CURRENT ||
+                  ehdr->e_version != EV_CURRENT) {
+               pr_debug("Unknown ELF version.\n");
+               return false;
+       }
+
+       if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) {
+               size_t phdr_size;
+
+               /*
+                * e_phnum is at most 65535 so calculating the size of the
+                * program header cannot overflow.
+                */
+               phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum;
+
+               /* Sanity check the program header table location. */
+               if (ehdr->e_phoff + phdr_size < ehdr->e_phoff) {
+                       pr_debug("Program headers at invalid location.\n");
+                       return false;
+               } else if (ehdr->e_phoff + phdr_size > buf_len) {
+                       pr_debug("Program headers truncated.\n");
+                       return false;
+               }
+       }
+
+       if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) {
+               size_t shdr_size;
+
+               /*
+                * e_shnum is at most 65536 so calculating
+                * the size of the section header cannot overflow.
+                */
+               shdr_size = sizeof(struct elf_shdr) * ehdr->e_shnum;
+
+               /* Sanity check the section header table location. */
+               if (ehdr->e_shoff + shdr_size < ehdr->e_shoff) {
+                       pr_debug("Section headers at invalid location.\n");
+                       return false;
+               } else if (ehdr->e_shoff + shdr_size > buf_len) {
+                       pr_debug("Section headers truncated.\n");
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+static int elf_read_ehdr(const char *buf, size_t len, struct elfhdr *ehdr)
+{
+       struct elfhdr *buf_ehdr;
+
+       if (len < sizeof(*buf_ehdr)) {
+               pr_debug("Buffer is too small to hold ELF header.\n");
+               return -ENOEXEC;
+       }
+
+       memset(ehdr, 0, sizeof(*ehdr));
+       memcpy(ehdr->e_ident, buf, sizeof(ehdr->e_ident));
+       if (!elf_is_elf_file(ehdr)) {
+               pr_debug("No ELF header magic.\n");
+               return -ENOEXEC;
+       }
+
+       if (ehdr->e_ident[EI_CLASS] != ELF_CLASS) {
+               pr_debug("Not a supported ELF class.\n");
+               return -ENOEXEC;
+       } else  if (ehdr->e_ident[EI_DATA] != ELFDATA2LSB &&
+               ehdr->e_ident[EI_DATA] != ELFDATA2MSB) {
+               pr_debug("Not a supported ELF data format.\n");
+               return -ENOEXEC;
+       }
+
+       buf_ehdr = (struct elfhdr *) buf;
+       if (elf16_to_cpu(ehdr, buf_ehdr->e_ehsize) != sizeof(*buf_ehdr)) {
+               pr_debug("Bad ELF header size.\n");
+               return -ENOEXEC;
+       }
+
+       ehdr->e_type      = elf16_to_cpu(ehdr, buf_ehdr->e_type);
+       ehdr->e_machine   = elf16_to_cpu(ehdr, buf_ehdr->e_machine);
+       ehdr->e_version   = elf32_to_cpu(ehdr, buf_ehdr->e_version);
+       ehdr->e_entry     = elf_addr_to_cpu(ehdr, buf_ehdr->e_entry);
+       ehdr->e_phoff     = elf_addr_to_cpu(ehdr, buf_ehdr->e_phoff);
+       ehdr->e_shoff     = elf_addr_to_cpu(ehdr, buf_ehdr->e_shoff);
+       ehdr->e_flags     = elf32_to_cpu(ehdr, buf_ehdr->e_flags);
+       ehdr->e_phentsize = elf16_to_cpu(ehdr, buf_ehdr->e_phentsize);
+       ehdr->e_phnum     = elf16_to_cpu(ehdr, buf_ehdr->e_phnum);
+       ehdr->e_shentsize = elf16_to_cpu(ehdr, buf_ehdr->e_shentsize);
+       ehdr->e_shnum     = elf16_to_cpu(ehdr, buf_ehdr->e_shnum);
+       ehdr->e_shstrndx  = elf16_to_cpu(ehdr, buf_ehdr->e_shstrndx);
+
+       return elf_is_ehdr_sane(ehdr, len) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_is_phdr_sane - check that it is safe to use the program header
+ * @buf_len:   size of the buffer in which the ELF file is loaded.
+ */
+static bool elf_is_phdr_sane(const struct elf_phdr *phdr, size_t buf_len)
+{
+
+       if (phdr->p_offset + phdr->p_filesz < phdr->p_offset) {
+               pr_debug("ELF segment location wraps around.\n");
+               return false;
+       } else if (phdr->p_offset + phdr->p_filesz > buf_len) {
+               pr_debug("ELF segment not in file.\n");
+               return false;
+       } else if (phdr->p_paddr + phdr->p_memsz < phdr->p_paddr) {
+               pr_debug("ELF segment address wraps around.\n");
+               return false;
+       }
+
+       return true;
+}
+
+static int elf_read_phdr(const char *buf, size_t len, struct elf_info *elf_info,
+                        int idx)
+{
+       /* Override the const in proghdrs, we are the ones doing the loading. */
+       struct elf_phdr *phdr = (struct elf_phdr *) &elf_info->proghdrs[idx];
+       const char *pbuf;
+       struct elf_phdr *buf_phdr;
+
+       pbuf = buf + elf_info->ehdr->e_phoff + (idx * sizeof(*buf_phdr));
+       buf_phdr = (struct elf_phdr *) pbuf;
+
+       phdr->p_type   = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_type);
+       phdr->p_offset = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_offset);
+       phdr->p_paddr  = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_paddr);
+       phdr->p_vaddr  = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_vaddr);
+       phdr->p_flags  = elf32_to_cpu(elf_info->ehdr, buf_phdr->p_flags);
+
+       /*
+        * The following fields have a type equivalent to Elf_Addr
+        * both in 32 bit and 64 bit ELF.
+        */
+       phdr->p_filesz = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_filesz);
+       phdr->p_memsz  = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_memsz);
+       phdr->p_align  = elf_addr_to_cpu(elf_info->ehdr, buf_phdr->p_align);
+
+       return elf_is_phdr_sane(phdr, len) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_read_phdrs - read the program headers from the buffer
+ *
+ * This function assumes that the program header table was checked for sanity.
+ * Use elf_is_ehdr_sane() if it wasn't.
+ */
+static int elf_read_phdrs(const char *buf, size_t len,
+                         struct elf_info *elf_info)
+{
+       size_t phdr_size, i;
+       const struct elfhdr *ehdr = elf_info->ehdr;
+
+       /*
+        * e_phnum is at most 65535 so calculating the size of the
+        * program header cannot overflow.
+        */
+       phdr_size = sizeof(struct elf_phdr) * ehdr->e_phnum;
+
+       elf_info->proghdrs = kzalloc(phdr_size, GFP_KERNEL);
+       if (!elf_info->proghdrs)
+               return -ENOMEM;
+
+       for (i = 0; i < ehdr->e_phnum; i++) {
+               int ret;
+
+               ret = elf_read_phdr(buf, len, elf_info, i);
+               if (ret) {
+                       kfree(elf_info->proghdrs);
+                       elf_info->proghdrs = NULL;
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
+/**
+ * elf_is_shdr_sane - check that it is safe to use the section header
+ * @buf_len:   size of the buffer in which the ELF file is loaded.
+ */
+static bool elf_is_shdr_sane(const struct elf_shdr *shdr, size_t buf_len)
+{
+       bool size_ok;
+
+       /* SHT_NULL headers have undefined values, so we can't check them. */
+       if (shdr->sh_type == SHT_NULL)
+               return true;
+
+       /* Now verify sh_entsize */
+       switch (shdr->sh_type) {
+       case SHT_SYMTAB:
+               size_ok = shdr->sh_entsize == sizeof(Elf_Sym);
+               break;
+       case SHT_RELA:
+               size_ok = shdr->sh_entsize == sizeof(Elf_Rela);
+               break;
+       case SHT_DYNAMIC:
+               size_ok = shdr->sh_entsize == sizeof(Elf_Dyn);
+               break;
+       case SHT_REL:
+               size_ok = shdr->sh_entsize == sizeof(Elf_Rel);
+               break;
+       case SHT_NOTE:
+       case SHT_PROGBITS:
+       case SHT_HASH:
+       case SHT_NOBITS:
+       default:
+               /*
+                * This is a section whose entsize requirements
+                * I don't care about.  If I don't know about
+                * the section I can't care about its entsize
+                * requirements.
+                */
+               size_ok = true;
+               break;
+       }
+
+       if (!size_ok) {
+               pr_debug("ELF section with wrong entry size.\n");
+               return false;
+       } else if (shdr->sh_addr + shdr->sh_size < shdr->sh_addr) {
+               pr_debug("ELF section address wraps around.\n");
+               return false;
+       }
+
+       if (shdr->sh_type != SHT_NOBITS) {
+               if (shdr->sh_offset + shdr->sh_size < shdr->sh_offset) {
+                       pr_debug("ELF section location wraps around.\n");
+                       return false;
+               } else if (shdr->sh_offset + shdr->sh_size > buf_len) {
+                       pr_debug("ELF section not in file.\n");
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+static int elf_read_shdr(const char *buf, size_t len, struct elf_info *elf_info,
+                        int idx)
+{
+       struct elf_shdr *shdr = &elf_info->sechdrs[idx];
+       const struct elfhdr *ehdr = elf_info->ehdr;
+       const char *sbuf;
+       struct elf_shdr *buf_shdr;
+
+       sbuf = buf + ehdr->e_shoff + idx * sizeof(*buf_shdr);
+       buf_shdr = (struct elf_shdr *) sbuf;
+
+       shdr->sh_name      = elf32_to_cpu(ehdr, buf_shdr->sh_name);
+       shdr->sh_type      = elf32_to_cpu(ehdr, buf_shdr->sh_type);
+       shdr->sh_addr      = elf_addr_to_cpu(ehdr, buf_shdr->sh_addr);
+       shdr->sh_offset    = elf_addr_to_cpu(ehdr, buf_shdr->sh_offset);
+       shdr->sh_link      = elf32_to_cpu(ehdr, buf_shdr->sh_link);
+       shdr->sh_info      = elf32_to_cpu(ehdr, buf_shdr->sh_info);
+
+       /*
+        * The following fields have a type equivalent to Elf_Addr
+        * both in 32 bit and 64 bit ELF.
+        */
+       shdr->sh_flags     = elf_addr_to_cpu(ehdr, buf_shdr->sh_flags);
+       shdr->sh_size      = elf_addr_to_cpu(ehdr, buf_shdr->sh_size);
+       shdr->sh_addralign = elf_addr_to_cpu(ehdr, buf_shdr->sh_addralign);
+       shdr->sh_entsize   = elf_addr_to_cpu(ehdr, buf_shdr->sh_entsize);
+
+       return elf_is_shdr_sane(shdr, len) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_read_shdrs - read the section headers from the buffer
+ *
+ * This function assumes that the section header table was checked for sanity.
+ * Use elf_is_ehdr_sane() if it wasn't.
+ */
+static int elf_read_shdrs(const char *buf, size_t len,
+                         struct elf_info *elf_info)
+{
+       size_t shdr_size, i;
+
+       /*
+        * e_shnum is at most 65536 so calculating
+        * the size of the section header cannot overflow.
+        */
+       shdr_size = sizeof(struct elf_shdr) * elf_info->ehdr->e_shnum;
+
+       elf_info->sechdrs = kzalloc(shdr_size, GFP_KERNEL);
+       if (!elf_info->sechdrs)
+               return -ENOMEM;
+
+       for (i = 0; i < elf_info->ehdr->e_shnum; i++) {
+               int ret;
+
+               ret = elf_read_shdr(buf, len, elf_info, i);
+               if (ret) {
+                       kfree(elf_info->sechdrs);
+                       elf_info->sechdrs = NULL;
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
+/**
+ * elf_read_from_buffer - read ELF file and set up ELF header and ELF info
+ * @buf:       Buffer to read ELF file from.
+ * @len:       Size of @buf.
+ * @ehdr:      Pointer to existing struct which will be populated.
+ * @elf_info:  Pointer to existing struct which will be populated.
+ *
+ * This function allows reading ELF files with different byte order than
+ * the kernel, byte-swapping the fields as needed.
+ *
+ * Return:
+ * On success returns 0, and the caller should call elf_free_info(elf_info) to
+ * free the memory allocated for the section and program headers.
+ */
+int elf_read_from_buffer(const char *buf, size_t len, struct elfhdr *ehdr,
+                        struct elf_info *elf_info)
+{
+       int ret;
+
+       ret = elf_read_ehdr(buf, len, ehdr);
+       if (ret)
+               return ret;
+
+       elf_info->buffer = buf;
+       elf_info->ehdr = ehdr;
+       if (ehdr->e_phoff > 0 && ehdr->e_phnum > 0) {
+               ret = elf_read_phdrs(buf, len, elf_info);
+               if (ret)
+                       return ret;
+       }
+       if (ehdr->e_shoff > 0 && ehdr->e_shnum > 0) {
+               ret = elf_read_shdrs(buf, len, elf_info);
+               if (ret) {
+                       kfree(elf_info->proghdrs);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
+/**
+ * elf_free_info - free memory allocated by elf_read_from_buffer
+ */
+void elf_free_info(struct elf_info *elf_info)
+{
+       kfree(elf_info->proghdrs);
+       kfree(elf_info->sechdrs);
+       memset(elf_info, 0, sizeof(*elf_info));
+}
+/**
+ * build_elf_exec_info - read ELF executable and check that we can use it
+ */
+static int build_elf_exec_info(const char *buf, size_t len, struct elfhdr *ehdr,
+                              struct elf_info *elf_info)
+{
+       int i;
+       int ret;
+
+       ret = elf_read_from_buffer(buf, len, ehdr, elf_info);
+       if (ret)
+               return ret;
+
+       /* Big endian vmlinux has type ET_DYN. */
+       if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) {
+               pr_err("Not an ELF executable.\n");
+               goto error;
+       } else if (!elf_info->proghdrs) {
+               pr_err("No ELF program header.\n");
+               goto error;
+       }
+
+       for (i = 0; i < ehdr->e_phnum; i++) {
+               /*
+                * Kexec does not support loading interpreters.
+                * In addition this check keeps us from attempting
+                * to kexec ordinary executables.
+                */
+               if (elf_info->proghdrs[i].p_type == PT_INTERP) {
+                       pr_err("Requires an ELF interpreter.\n");
+                       goto error;
+               }
+       }
+
+       return 0;
+error:
+       elf_free_info(elf_info);
+       return -ENOEXEC;
+}
+
+static int elf64_probe(const char *buf, unsigned long len)
+{
+       struct elfhdr ehdr;
+       struct elf_info elf_info;
+       int ret;
+
+       ret = build_elf_exec_info(buf, len, &ehdr, &elf_info);
+       if (ret)
+               return ret;
+
+       elf_free_info(&elf_info);
+
+       return elf_check_arch(&ehdr) ? 0 : -ENOEXEC;
+}
+
+/**
+ * elf_exec_load - load ELF executable image
+ * @lowest_load_addr:  On return, will be the address where the first PT_LOAD
+ *                     section will be loaded in memory.
+ *
+ * Return:
+ * 0 on success, negative value on failure.
+ */
+static int elf_exec_load(struct kimage *image, struct elfhdr *ehdr,
+                        struct elf_info *elf_info,
+                        unsigned long *lowest_load_addr)
+{
+       unsigned long base = 0, lowest_addr = UINT_MAX;
+       int ret;
+       size_t i;
+       struct kexec_buf kbuf = { .image = image, .buf_max = ppc64_rma_size,
+                                 .top_down = false };
+
+       /* Read in the PT_LOAD segments. */
+       for (i = 0; i < ehdr->e_phnum; i++) {
+               unsigned long load_addr;
+               size_t size;
+               const struct elf_phdr *phdr;
+
+               phdr = &elf_info->proghdrs[i];
+               if (phdr->p_type != PT_LOAD)
+                       continue;
+
+               size = phdr->p_filesz;
+               if (size > phdr->p_memsz)
+                       size = phdr->p_memsz;
+
+               kbuf.buffer = (void *) elf_info->buffer + phdr->p_offset;
+               kbuf.bufsz = size;
+               kbuf.memsz = phdr->p_memsz;
+               kbuf.buf_align = phdr->p_align;
+               kbuf.buf_min = phdr->p_paddr + base;
+               ret = kexec_add_buffer(&kbuf);
+               if (ret)
+                       goto out;
+               load_addr = kbuf.mem;
+
+               if (load_addr < lowest_addr)
+                       lowest_addr = load_addr;
+       }
+
+       /* Update entry point to reflect new load address. */
+       ehdr->e_entry += base;
+
+       *lowest_load_addr = lowest_addr;
+       ret = 0;
+ out:
+       return ret;
+}
+
+static void *elf64_load(struct kimage *image, char *kernel_buf,
+                       unsigned long kernel_len, char *initrd,
+                       unsigned long initrd_len, char *cmdline,
+                       unsigned long cmdline_len)
+{
+       int ret;
+       unsigned int fdt_size;
+       unsigned long kernel_load_addr, purgatory_load_addr;
+       unsigned long initrd_load_addr = 0, fdt_load_addr;
+       void *fdt;
+       const void *slave_code;
+       struct elfhdr ehdr;
+       struct elf_info elf_info;
+       struct kexec_buf kbuf = { .image = image, .buf_min = 0,
+                                 .buf_max = ppc64_rma_size };
+
+       ret = build_elf_exec_info(kernel_buf, kernel_len, &ehdr, &elf_info);
+       if (ret)
+               goto out;
+
+       ret = elf_exec_load(image, &ehdr, &elf_info, &kernel_load_addr);
+       if (ret)
+               goto out;
+
+       pr_debug("Loaded the kernel at 0x%lx\n", kernel_load_addr);
+
+       ret = kexec_load_purgatory(image, 0, ppc64_rma_size, true,
+                                  &purgatory_load_addr);
+       if (ret) {
+               pr_err("Loading purgatory failed.\n");
+               goto out;
+       }
+
+       pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr);
+
+       if (initrd != NULL) {
+               kbuf.buffer = initrd;
+               kbuf.bufsz = kbuf.memsz = initrd_len;
+               kbuf.buf_align = PAGE_SIZE;
+               kbuf.top_down = false;
+               ret = kexec_add_buffer(&kbuf);
+               if (ret)
+                       goto out;
+               initrd_load_addr = kbuf.mem;
+
+               pr_debug("Loaded initrd at 0x%lx\n", initrd_load_addr);
+       }
+
+       fdt_size = fdt_totalsize(initial_boot_params) * 2;
+       fdt = kmalloc(fdt_size, GFP_KERNEL);
+       if (!fdt) {
+               pr_err("Not enough memory for the device tree.\n");
+               ret = -ENOMEM;
+               goto out;
+       }
+       ret = fdt_open_into(initial_boot_params, fdt, fdt_size);
+       if (ret < 0) {
+               pr_err("Error setting up the new device tree.\n");
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = setup_new_fdt(fdt, initrd_load_addr, initrd_len, cmdline);
+       if (ret)
+               goto out;
+
+       fdt_pack(fdt);
+
+       kbuf.buffer = fdt;
+       kbuf.bufsz = kbuf.memsz = fdt_size;
+       kbuf.buf_align = PAGE_SIZE;
+       kbuf.top_down = true;
+       ret = kexec_add_buffer(&kbuf);
+       if (ret)
+               goto out;
+       fdt_load_addr = kbuf.mem;
+
+       pr_debug("Loaded device tree at 0x%lx\n", fdt_load_addr);
+
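+       /*
+        * slave_code points at the start of the kernel's first PT_LOAD
+        * segment; setup_purgatory() copies SLAVE_CODE_SIZE bytes from it.
+        */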
+       slave_code = elf_info.buffer + elf_info.proghdrs[0].p_offset;
+       ret = setup_purgatory(image, slave_code, fdt, kernel_load_addr,
+                             fdt_load_addr);
+       if (ret)
+               pr_err("Error setting up the purgatory.\n");
+
+out:
+       elf_free_info(&elf_info);
+
+       /* Make kimage_file_post_load_cleanup free the fdt buffer for us. */
+       return ret ? ERR_PTR(ret) : fdt;
+}
+
+struct kexec_file_ops kexec_elf64_ops = {
+       .probe = elf64_probe,
+       .load = elf64_load,
+};
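For context, a hedged userspace sketch of the kexec_file_load syscall this loader serves. SYS_kexec_file_load is only available where the libc headers carry the new syscall number, and the file paths and command line below are placeholders:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	const char cmdline[] = "root=/dev/sda2 ro";
	int kernel_fd = open("/boot/vmlinux", O_RDONLY);
	int initrd_fd = open("/boot/initrd.img", O_RDONLY);

	if (kernel_fd < 0 || initrd_fd < 0)
		return 1;

	/* cmdline_len must include the trailing NUL; flags 0 = normal load. */
	if (syscall(SYS_kexec_file_load, kernel_fd, initrd_fd,
		    sizeof(cmdline), cmdline, 0UL) < 0) {
		perror("kexec_file_load");
		return 1;
	}
	return 0;
}

Passing 0 for flags requests a normal (non-crash) load; the arch probe further below rejects crash kernels for now.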
index 9479d8e360cfe9b6ba8d677072898dec25e7b5a5..ad108b842669566130cc2734e947f51203f55914 100644 (file)
@@ -140,13 +140,16 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
        regs->link = (unsigned long)kretprobe_trampoline;
 }
 
-static int __kprobes kprobe_handler(struct pt_regs *regs)
+int __kprobes kprobe_handler(struct pt_regs *regs)
 {
        struct kprobe *p;
        int ret = 0;
        unsigned int *addr = (unsigned int *)regs->nip;
        struct kprobe_ctlblk *kcb;
 
+       if (user_mode(regs))
+               return 0;
+
        /*
         * We don't want to be preempted for the entire
         * duration of kprobe processing
@@ -359,12 +362,12 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
  * single-stepped a copy of the instruction.  The address of this
  * copy is p->ainsn.insn.
  */
-static int __kprobes post_kprobe_handler(struct pt_regs *regs)
+int __kprobes kprobe_post_handler(struct pt_regs *regs)
 {
        struct kprobe *cur = kprobe_running();
        struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 
-       if (!cur)
+       if (!cur || user_mode(regs))
                return 0;
 
        /* make sure we got here for instruction we have a kprobe on */
@@ -470,25 +473,7 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
                                       unsigned long val, void *data)
 {
-       struct die_args *args = (struct die_args *)data;
-       int ret = NOTIFY_DONE;
-
-       if (args->regs && user_mode(args->regs))
-               return ret;
-
-       switch (val) {
-       case DIE_BPT:
-               if (kprobe_handler(args->regs))
-                       ret = NOTIFY_STOP;
-               break;
-       case DIE_SSTEP:
-               if (post_kprobe_handler(args->regs))
-                       ret = NOTIFY_STOP;
-               break;
-       default:
-               break;
-       }
-       return ret;
+       return NOTIFY_DONE;
 }
 
 unsigned long arch_deref_entry_point(void *entry)
index a205fa3d9bf3aef238c3e58655b7cc2f21430736..5c12e21d0d1a1976ac399a618719bb5672247d97 100644 (file)
@@ -310,7 +310,7 @@ void default_machine_kexec(struct kimage *image)
        if (!kdump_in_progress())
                kexec_prepare_cpus();
 
-       pr_debug("kexec: Starting switchover sequence.\n");
+       printk("kexec: Starting switchover sequence.\n");
 
        /* switch to a staticly allocated stack.  Based on irq stack code.
         * We setup preempt_count to avoid using VMX in memcpy.
diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c
new file mode 100644 (file)
index 0000000..7abc8a7
--- /dev/null
@@ -0,0 +1,338 @@
+/*
+ * ppc64 code to implement the kexec_file_load syscall
+ *
+ * Copyright (C) 2004  Adam Litke (agl@us.ibm.com)
+ * Copyright (C) 2004  IBM Corp.
+ * Copyright (C) 2004,2005  Milton D Miller II, IBM Corporation
+ * Copyright (C) 2005  R Sharada (sharada@in.ibm.com)
+ * Copyright (C) 2006  Mohan Kumar M (mohan@in.ibm.com)
+ * Copyright (C) 2016  IBM Corporation
+ *
+ * Based on kexec-tools' kexec-elf-ppc64.c, fs2dt.c.
+ * Heavily modified for the kernel by
+ * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation (version 2 of the License).
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/memblock.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
+
+#define SLAVE_CODE_SIZE                256
+
+static struct kexec_file_ops *kexec_file_loaders[] = {
+       &kexec_elf64_ops,
+};
+
+int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+                                 unsigned long buf_len)
+{
+       int i, ret = -ENOEXEC;
+       struct kexec_file_ops *fops;
+
+       /* We don't support crash kernels yet. */
+       if (image->type == KEXEC_TYPE_CRASH)
+               return -ENOTSUPP;
+
+       for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) {
+               fops = kexec_file_loaders[i];
+               if (!fops || !fops->probe)
+                       continue;
+
+               ret = fops->probe(buf, buf_len);
+               if (!ret) {
+                       image->fops = fops;
+                       return ret;
+               }
+       }
+
+       return ret;
+}
+
+void *arch_kexec_kernel_image_load(struct kimage *image)
+{
+       if (!image->fops || !image->fops->load)
+               return ERR_PTR(-ENOEXEC);
+
+       return image->fops->load(image, image->kernel_buf,
+                                image->kernel_buf_len, image->initrd_buf,
+                                image->initrd_buf_len, image->cmdline_buf,
+                                image->cmdline_buf_len);
+}
+
+int arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+       if (!image->fops || !image->fops->cleanup)
+               return 0;
+
+       return image->fops->cleanup(image->image_loader_data);
+}
+
+/**
+ * arch_kexec_walk_mem - call func(data) for each unreserved memory block
+ * @kbuf:      Context info for the search. Also passed to @func.
+ * @func:      Function to call for each memory block.
+ *
+ * This function is used by kexec_add_buffer and kexec_locate_mem_hole
+ * to find unreserved memory to load kexec segments into.
+ *
+ * Return: The walk stops as soon as @func returns a non-zero value, and that
+ * value is returned. If all free regions are visited without @func returning
+ * non-zero, zero is returned.
+ */
+int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *))
+{
+       int ret = 0;
+       u64 i;
+       phys_addr_t mstart, mend;
+
+       if (kbuf->top_down) {
+               for_each_free_mem_range_reverse(i, NUMA_NO_NODE, 0,
+                                               &mstart, &mend, NULL) {
+                       /*
+                        * In memblock, end points to the first byte after the
+                        * range while in kexec, end points to the last byte
+                        * in the range.
+                        */
+                       ret = func(mstart, mend - 1, kbuf);
+                       if (ret)
+                               break;
+               }
+       } else {
+               for_each_free_mem_range(i, NUMA_NO_NODE, 0, &mstart, &mend,
+                                       NULL) {
+                       /*
+                        * In memblock, end points to the first byte after the
+                        * range while in kexec, end points to the last byte
+                        * in the range.
+                        */
+                       ret = func(mstart, mend - 1, kbuf);
+                       if (ret)
+                               break;
+               }
+       }
+
+       return ret;
+}
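A hedged sketch of a walker callback matching the int (*func)(u64, u64, void *) contract documented above; first_fit_hole() is illustrative only, while the generic kexec_locate_mem_hole() drives this walker with a more careful callback of the same shape:

/* Illustrative only: stop at the first hole large enough for kbuf->memsz. */
static int first_fit_hole(u64 start, u64 end, void *data)
{
	struct kexec_buf *kbuf = data;	/* the walker always passes its kbuf here */
	u64 size = end - start + 1;	/* end is the last byte of the range */

	if (size < kbuf->memsz)
		return 0;		/* too small, keep walking */

	pr_debug("usable hole at 0x%llx-0x%llx\n",
		 (unsigned long long)start, (unsigned long long)end);
	return 1;			/* non-zero stops the walk and is returned */
}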
+
+/**
+ * setup_purgatory - initialize the purgatory's global variables
+ * @image:             kexec image.
+ * @slave_code:                Slave code for the purgatory.
+ * @fdt:               Flattened device tree for the next kernel.
+ * @kernel_load_addr:  Address where the kernel is loaded.
+ * @fdt_load_addr:     Address where the flattened device tree is loaded.
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+int setup_purgatory(struct kimage *image, const void *slave_code,
+                   const void *fdt, unsigned long kernel_load_addr,
+                   unsigned long fdt_load_addr)
+{
+       unsigned int *slave_code_buf, master_entry;
+       int ret;
+
+       slave_code_buf = kmalloc(SLAVE_CODE_SIZE, GFP_KERNEL);
+       if (!slave_code_buf)
+               return -ENOMEM;
+
+       /* Get the slave code from the new kernel and put it in purgatory. */
+       ret = kexec_purgatory_get_set_symbol(image, "purgatory_start",
+                                            slave_code_buf, SLAVE_CODE_SIZE,
+                                            true);
+       if (ret) {
+               kfree(slave_code_buf);
+               return ret;
+       }
+
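+       /*
+        * Keep the first word of purgatory_start (the branch taken by the
+        * master CPU) and overlay the rest of the buffer with the slave
+        * code before writing it back below.
+        */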
+       master_entry = slave_code_buf[0];
+       memcpy(slave_code_buf, slave_code, SLAVE_CODE_SIZE);
+       slave_code_buf[0] = master_entry;
+       ret = kexec_purgatory_get_set_symbol(image, "purgatory_start",
+                                            slave_code_buf, SLAVE_CODE_SIZE,
+                                            false);
+       kfree(slave_code_buf);
+
+       ret = kexec_purgatory_get_set_symbol(image, "kernel", &kernel_load_addr,
+                                            sizeof(kernel_load_addr), false);
+       if (ret)
+               return ret;
+       ret = kexec_purgatory_get_set_symbol(image, "dt_offset", &fdt_load_addr,
+                                            sizeof(fdt_load_addr), false);
+       if (ret)
+               return ret;
+
+       return 0;
+}
+
+/**
+ * delete_fdt_mem_rsv - delete memory reservation with given address and size
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size)
+{
+       int i, ret, num_rsvs = fdt_num_mem_rsv(fdt);
+
+       for (i = 0; i < num_rsvs; i++) {
+               uint64_t rsv_start, rsv_size;
+
+               ret = fdt_get_mem_rsv(fdt, i, &rsv_start, &rsv_size);
+               if (ret) {
+                       pr_err("Malformed device tree.\n");
+                       return -EINVAL;
+               }
+
+               if (rsv_start == start && rsv_size == size) {
+                       ret = fdt_del_mem_rsv(fdt, i);
+                       if (ret) {
+                               pr_err("Error deleting device tree reservation.\n");
+                               return -EINVAL;
+                       }
+
+                       return 0;
+               }
+       }
+
+       return -ENOENT;
+}
+
+/**
+ * setup_new_fdt - modify /chosen and memory reservation for the next kernel
+ * @fdt:               Flattened device tree for the next kernel.
+ * @initrd_load_addr:  Address where the next initrd will be loaded.
+ * @initrd_len:                Size of the next initrd, or 0 if there will be none.
+ * @cmdline:           Command line for the next kernel, or NULL if there will
+ *                     be none.
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+int setup_new_fdt(void *fdt, unsigned long initrd_load_addr,
+                 unsigned long initrd_len, const char *cmdline)
+{
+       int ret, chosen_node;
+       const void *prop;
+
+       /* Remove memory reservation for the current device tree. */
+       ret = delete_fdt_mem_rsv(fdt, __pa(initial_boot_params),
+                                fdt_totalsize(initial_boot_params));
+       if (ret == 0)
+               pr_debug("Removed old device tree reservation.\n");
+       else if (ret != -ENOENT)
+               return ret;
+
+       chosen_node = fdt_path_offset(fdt, "/chosen");
+       if (chosen_node == -FDT_ERR_NOTFOUND) {
+               chosen_node = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"),
+                                             "chosen");
+               if (chosen_node < 0) {
+                       pr_err("Error creating /chosen.\n");
+                       return -EINVAL;
+               }
+       } else if (chosen_node < 0) {
+               pr_err("Malformed device tree: error reading /chosen.\n");
+               return -EINVAL;
+       }
+
+       /* Did we boot using an initrd? */
+       prop = fdt_getprop(fdt, chosen_node, "linux,initrd-start", NULL);
+       if (prop) {
+               uint64_t tmp_start, tmp_end, tmp_size;
+
+               tmp_start = fdt64_to_cpu(*((const fdt64_t *) prop));
+
+               prop = fdt_getprop(fdt, chosen_node, "linux,initrd-end", NULL);
+               if (!prop) {
+                       pr_err("Malformed device tree.\n");
+                       return -EINVAL;
+               }
+               tmp_end = fdt64_to_cpu(*((const fdt64_t *) prop));
+
+               /*
+                * kexec reserves exact initrd size, while firmware may
+                * reserve a multiple of PAGE_SIZE, so check for both.
+                */
+               tmp_size = tmp_end - tmp_start;
+               ret = delete_fdt_mem_rsv(fdt, tmp_start, tmp_size);
+               if (ret == -ENOENT)
+                       ret = delete_fdt_mem_rsv(fdt, tmp_start,
+                                                round_up(tmp_size, PAGE_SIZE));
+               if (ret == 0)
+                       pr_debug("Removed old initrd reservation.\n");
+               else if (ret != -ENOENT)
+                       return ret;
+
+               /* If there's no new initrd, delete the old initrd's info. */
+               if (initrd_len == 0) {
+                       ret = fdt_delprop(fdt, chosen_node,
+                                         "linux,initrd-start");
+                       if (ret) {
+                               pr_err("Error deleting linux,initrd-start.\n");
+                               return -EINVAL;
+                       }
+
+                       ret = fdt_delprop(fdt, chosen_node, "linux,initrd-end");
+                       if (ret) {
+                               pr_err("Error deleting linux,initrd-end.\n");
+                               return -EINVAL;
+                       }
+               }
+       }
+
+       if (initrd_len) {
+               ret = fdt_setprop_u64(fdt, chosen_node,
+                                     "linux,initrd-start",
+                                     initrd_load_addr);
+               if (ret < 0) {
+                       pr_err("Error setting up the new device tree.\n");
+                       return -EINVAL;
+               }
+
+               /* initrd-end is the first address after the initrd image. */
+               ret = fdt_setprop_u64(fdt, chosen_node, "linux,initrd-end",
+                                     initrd_load_addr + initrd_len);
+               if (ret < 0) {
+                       pr_err("Error setting up the new device tree.\n");
+                       return -EINVAL;
+               }
+
+               ret = fdt_add_mem_rsv(fdt, initrd_load_addr, initrd_len);
+               if (ret) {
+                       pr_err("Error reserving initrd memory: %s\n",
+                              fdt_strerror(ret));
+                       return -EINVAL;
+               }
+       }
+
+       if (cmdline != NULL) {
+               ret = fdt_setprop_string(fdt, chosen_node, "bootargs", cmdline);
+               if (ret < 0) {
+                       pr_err("Error setting up the new device tree.\n");
+                       return -EINVAL;
+               }
+       } else {
+               ret = fdt_delprop(fdt, chosen_node, "bootargs");
+               if (ret && ret != -FDT_ERR_NOTFOUND) {
+                       pr_err("Error deleting bootargs.\n");
+                       return -EINVAL;
+               }
+       }
+
+       ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
+       if (ret) {
+               pr_err("Error setting up the new device tree.\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
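A hedged read-back sketch using libfdt calls already relied on above; dump_kexec_chosen() is illustrative only and not part of this patch:

static void dump_kexec_chosen(const void *fdt)
{
	int node = fdt_path_offset(fdt, "/chosen");
	const fdt64_t *start, *end;

	if (node < 0)
		return;

	start = fdt_getprop(fdt, node, "linux,initrd-start", NULL);
	end = fdt_getprop(fdt, node, "linux,initrd-end", NULL);
	if (start && end)
		pr_debug("next initrd: 0x%llx-0x%llx\n",
			 (unsigned long long)fdt64_to_cpu(*start),
			 (unsigned long long)fdt64_to_cpu(*end));

	/* Zero-length marker added unconditionally by setup_new_fdt(). */
	if (fdt_getprop(fdt, node, "linux,booted-from-kexec", NULL))
		pr_debug("linux,booted-from-kexec is set\n");
}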
index 5e7ece0fda9f5b802eb561ce51774cfb625032a6..c6923ff451311bfade14e7f68888f85bb69f7176 100644 (file)
@@ -72,7 +72,6 @@ void save_mce_event(struct pt_regs *regs, long handled,
                    struct mce_error_info *mce_err,
                    uint64_t nip, uint64_t addr)
 {
-       uint64_t srr1;
        int index = __this_cpu_inc_return(mce_nest_count) - 1;
        struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
 
@@ -99,8 +98,6 @@ void save_mce_event(struct pt_regs *regs, long handled,
                mce->disposition = MCE_DISPOSITION_NOT_RECOVERED;
        mce->severity = MCE_SEV_ERROR_SYNC;
 
-       srr1 = regs->msr;
-
        /*
         * Populate the mce error_type and type-specific error_type.
         */
index 93cf7a5846a6f5875534cc43a2a3409357dc482a..1863324c6a3c96db16028de56d35f0c2698f33e6 100644 (file)
@@ -614,7 +614,7 @@ _GLOBAL(start_secondary_resume)
 _GLOBAL(__main)
        blr
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        /*
         * Must be relocatable PIC code callable as a C function.
         */
index 4f178671f230ccd799a9089ca4389590e94bc703..32be2a844947436662b48e034ff42a2cad6fc3d9 100644 (file)
@@ -478,7 +478,7 @@ _GLOBAL(kexec_wait)
        addi    r5,r5,kexec_flag-1b
 
 99:    HMT_LOW
-#ifdef CONFIG_KEXEC            /* use no memory without kexec */
+#ifdef CONFIG_KEXEC_CORE       /* use no memory without kexec */
        lwz     r4,0(r5)
        cmpwi   0,r4,0
        beq     99b
@@ -503,7 +503,7 @@ kexec_flag:
        .long   0
 
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 #ifdef CONFIG_PPC_BOOK3E
 /*
  * BOOK3E has no real MMU mode, so we have to setup the initial TLB
@@ -716,4 +716,4 @@ _GLOBAL(kexec_sequence)
        mtlr    4
        li      r5,0
        blr     /* image->start(physid, image->start, 0); */
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
index b60a67d92ebd0493e67762b9394a02caab60738a..34aeac54f120053a1ff9f8f9311baf30f7b1a1f6 100644 (file)
@@ -114,11 +114,6 @@ static struct platform_driver of_pci_phb_driver = {
        },
 };
 
-static __init int of_pci_phb_init(void)
-{
-       return platform_driver_register(&of_pci_phb_driver);
-}
-
-device_initcall(of_pci_phb_init);
+builtin_platform_driver(of_pci_phb_driver);
 
 #endif /* CONFIG_PPC_OF_PLATFORM_PCI */
index 5b62e8c36210a3b55f19547aebf0a294c493f0e9..9da9a42595ceb42eb7cd069edd868b796ea21e59 100644 (file)
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
 
+#ifdef CONFIG_CC_STACKPROTECTOR
+#include <linux/stackprotector.h>
+unsigned long __stack_chk_guard __read_mostly;
+EXPORT_SYMBOL(__stack_chk_guard);
+#endif
+
 /* Transactional Memory debug */
 #ifdef TM_DEBUG_SW
 #define TM_DEBUG(x...) printk(KERN_INFO x)
index b0245bed6f54862d187ef5e12757e872be7fca5c..f5d399e461932c19a0ce01af7c3f976b3cc39eab 100644 (file)
@@ -156,21 +156,22 @@ static struct ibm_pa_feature {
        unsigned char   pabit;          /* bit number (big-endian) */
        unsigned char   invert;         /* if 1, pa bit set => clear feature */
 } ibm_pa_features[] __initdata = {
-       {0, 0, PPC_FEATURE_HAS_MMU, 0,          0, 0, 0},
-       {0, 0, PPC_FEATURE_HAS_FPU, 0,          0, 1, 0},
-       {CPU_FTR_CTRL, 0, 0, 0,                 0, 3, 0},
-       {CPU_FTR_NOEXECUTE, 0, 0, 0,            0, 6, 0},
-       {CPU_FTR_NODSISRALIGN, 0, 0, 0,         1, 1, 1},
-       {0, MMU_FTR_CI_LARGE_PAGE, 0, 0,                1, 2, 0},
-       {CPU_FTR_REAL_LE, 0, PPC_FEATURE_TRUE_LE, 0, 5, 0, 0},
+       { .pabyte = 0,  .pabit = 0, .cpu_user_ftrs = PPC_FEATURE_HAS_MMU },
+       { .pabyte = 0,  .pabit = 1, .cpu_user_ftrs = PPC_FEATURE_HAS_FPU },
+       { .pabyte = 0,  .pabit = 3, .cpu_features  = CPU_FTR_CTRL },
+       { .pabyte = 0,  .pabit = 6, .cpu_features  = CPU_FTR_NOEXECUTE },
+       { .pabyte = 1,  .pabit = 2, .mmu_features  = MMU_FTR_CI_LARGE_PAGE },
+       { .pabyte = 40, .pabit = 0, .mmu_features  = MMU_FTR_TYPE_RADIX },
+       { .pabyte = 1,  .pabit = 1, .invert = 1, .cpu_features = CPU_FTR_NODSISRALIGN },
+       { .pabyte = 5,  .pabit = 0, .cpu_features  = CPU_FTR_REAL_LE,
+                                   .cpu_user_ftrs = PPC_FEATURE_TRUE_LE },
        /*
         * If the kernel doesn't support TM (ie CONFIG_PPC_TRANSACTIONAL_MEM=n),
         * we don't want to turn on TM here, so we use the *_COMP versions
         * which are 0 if the kernel doesn't support TM.
         */
-       {CPU_FTR_TM_COMP, 0, 0,
-        PPC_FEATURE2_HTM_COMP|PPC_FEATURE2_HTM_NOSC_COMP, 22, 0, 0},
-       {0, MMU_FTR_TYPE_RADIX, 0, 0,           40, 0, 0},
+       { .pabyte = 22, .pabit = 0, .cpu_features = CPU_FTR_TM_COMP,
+         .cpu_user_ftrs2 = PPC_FEATURE2_HTM_COMP | PPC_FEATURE2_HTM_NOSC_COMP },
 };
 
 static void __init scan_features(unsigned long node, const unsigned char *ftrs,
@@ -427,7 +428,7 @@ static int __init early_init_dt_scan_chosen_ppc(unsigned long node,
                tce_alloc_end = *lprop;
 #endif
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        lprop = of_get_flat_dt_prop(node, "linux,crashkernel-base", NULL);
        if (lprop)
                crashk_res.start = *lprop;
index 88ac964f48586bb90f5b1daf19e9e192237dec6a..ec47a939cbdd6dd81c6c05ed6707f28e12d9f0ea 100644 (file)
@@ -461,14 +461,14 @@ static int __init prom_next_node(phandle *nodep)
        }
 }
 
-static int inline prom_getprop(phandle node, const char *pname,
+static inline int prom_getprop(phandle node, const char *pname,
                               void *value, size_t valuelen)
 {
        return call_prom("getprop", 4, 1, node, ADDR(pname),
                         (u32)(unsigned long) value, (u32) valuelen);
 }
 
-static int inline prom_getproplen(phandle node, const char *pname)
+static inline int prom_getproplen(phandle node, const char *pname)
 {
        return call_prom("getproplen", 2, 1, node, ADDR(pname));
 }
@@ -635,13 +635,7 @@ static void __init early_cmdline_parse(void)
  *
  * See prom.h for the definition of the bits specified in the
  * architecture vector.
- *
- * Because the description vector contains a mix of byte and word
- * values, we declare it as an unsigned char array, and use this
- * macro to put word values in.
  */
-#define W(x)   ((x) >> 24) & 0xff, ((x) >> 16) & 0xff, \
-               ((x) >> 8) & 0xff, (x) & 0xff
 
 /* Firmware expects the value to be n - 1, where n is the # of vectors */
 #define NUM_VECTORS(n)         ((n) - 1)
@@ -652,92 +646,205 @@ static void __init early_cmdline_parse(void)
  */
 #define VECTOR_LENGTH(n)       (1 + (n) - 2)
 
-unsigned char ibm_architecture_vec[] = {
-       W(0xfffe0000), W(0x003a0000),   /* POWER5/POWER5+ */
-       W(0xffff0000), W(0x003e0000),   /* POWER6 */
-       W(0xffff0000), W(0x003f0000),   /* POWER7 */
-       W(0xffff0000), W(0x004b0000),   /* POWER8E */
-       W(0xffff0000), W(0x004c0000),   /* POWER8NVL */
-       W(0xffff0000), W(0x004d0000),   /* POWER8 */
-       W(0xffffffff), W(0x0f000004),   /* all 2.07-compliant */
-       W(0xffffffff), W(0x0f000003),   /* all 2.06-compliant */
-       W(0xffffffff), W(0x0f000002),   /* all 2.05-compliant */
-       W(0xfffffffe), W(0x0f000001),   /* all 2.04-compliant and earlier */
-       NUM_VECTORS(6),                 /* 6 option vectors */
-
-       /* option vector 1: processor architectures supported */
-       VECTOR_LENGTH(2),               /* length */
-       0,                              /* don't ignore, don't halt */
-       OV1_PPC_2_00 | OV1_PPC_2_01 | OV1_PPC_2_02 | OV1_PPC_2_03 |
-       OV1_PPC_2_04 | OV1_PPC_2_05 | OV1_PPC_2_06 | OV1_PPC_2_07,
+struct option_vector1 {
+       u8 byte1;
+       u8 arch_versions;
+} __packed;
+
+struct option_vector2 {
+       u8 byte1;
+       __be16 reserved;
+       __be32 real_base;
+       __be32 real_size;
+       __be32 virt_base;
+       __be32 virt_size;
+       __be32 load_base;
+       __be32 min_rma;
+       __be32 min_load;
+       u8 min_rma_percent;
+       u8 max_pft_size;
+} __packed;
+
+struct option_vector3 {
+       u8 byte1;
+       u8 byte2;
+} __packed;
+
+struct option_vector4 {
+       u8 byte1;
+       u8 min_vp_cap;
+} __packed;
+
+struct option_vector5 {
+       u8 byte1;
+       u8 byte2;
+       u8 byte3;
+       u8 cmo;
+       u8 associativity;
+       u8 bin_opts;
+       u8 micro_checkpoint;
+       u8 reserved0;
+       __be32 max_cpus;
+       __be16 papr_level;
+       __be16 reserved1;
+       u8 platform_facilities;
+       u8 reserved2;
+       __be16 reserved3;
+       u8 subprocessors;
+} __packed;
+
+struct option_vector6 {
+       u8 reserved;
+       u8 secondary_pteg;
+       u8 os_name;
+} __packed;
+
+struct ibm_arch_vec {
+       struct { u32 mask, val; } pvrs[10];
+
+       u8 num_vectors;
+
+       u8 vec1_len;
+       struct option_vector1 vec1;
+
+       u8 vec2_len;
+       struct option_vector2 vec2;
+
+       u8 vec3_len;
+       struct option_vector3 vec3;
+
+       u8 vec4_len;
+       struct option_vector4 vec4;
+
+       u8 vec5_len;
+       struct option_vector5 vec5;
+
+       u8 vec6_len;
+       struct option_vector6 vec6;
+} __packed;
+
+struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
+       .pvrs = {
+               {
+                       .mask = cpu_to_be32(0xfffe0000), /* POWER5/POWER5+ */
+                       .val  = cpu_to_be32(0x003a0000),
+               },
+               {
+                       .mask = cpu_to_be32(0xffff0000), /* POWER6 */
+                       .val  = cpu_to_be32(0x003e0000),
+               },
+               {
+                       .mask = cpu_to_be32(0xffff0000), /* POWER7 */
+                       .val  = cpu_to_be32(0x003f0000),
+               },
+               {
+                       .mask = cpu_to_be32(0xffff0000), /* POWER8E */
+                       .val  = cpu_to_be32(0x004b0000),
+               },
+               {
+                       .mask = cpu_to_be32(0xffff0000), /* POWER8NVL */
+                       .val  = cpu_to_be32(0x004c0000),
+               },
+               {
+                       .mask = cpu_to_be32(0xffff0000), /* POWER8 */
+                       .val  = cpu_to_be32(0x004d0000),
+               },
+               {
+                       .mask = cpu_to_be32(0xffffffff), /* all 2.07-compliant */
+                       .val  = cpu_to_be32(0x0f000004),
+               },
+               {
+                       .mask = cpu_to_be32(0xffffffff), /* all 2.06-compliant */
+                       .val  = cpu_to_be32(0x0f000003),
+               },
+               {
+                       .mask = cpu_to_be32(0xffffffff), /* all 2.05-compliant */
+                       .val  = cpu_to_be32(0x0f000002),
+               },
+               {
+                       .mask = cpu_to_be32(0xfffffffe), /* all 2.04-compliant and earlier */
+                       .val  = cpu_to_be32(0x0f000001),
+               },
+       },
+
+       .num_vectors = NUM_VECTORS(6),
 
+       .vec1_len = VECTOR_LENGTH(sizeof(struct option_vector1)),
+       .vec1 = {
+               .byte1 = 0,
+               .arch_versions = OV1_PPC_2_00 | OV1_PPC_2_01 | OV1_PPC_2_02 | OV1_PPC_2_03 |
+                                OV1_PPC_2_04 | OV1_PPC_2_05 | OV1_PPC_2_06 | OV1_PPC_2_07,
+       },
+
+       .vec2_len = VECTOR_LENGTH(sizeof(struct option_vector2)),
        /* option vector 2: Open Firmware options supported */
-       VECTOR_LENGTH(33),              /* length */
-       OV2_REAL_MODE,
-       0, 0,
-       W(0xffffffff),                  /* real_base */
-       W(0xffffffff),                  /* real_size */
-       W(0xffffffff),                  /* virt_base */
-       W(0xffffffff),                  /* virt_size */
-       W(0xffffffff),                  /* load_base */
-       W(256),                         /* 256MB min RMA */
-       W(0xffffffff),                  /* full client load */
-       0,                              /* min RMA percentage of total RAM */
-       48,                             /* max log_2(hash table size) */
+       .vec2 = {
+               .byte1 = OV2_REAL_MODE,
+               .reserved = 0,
+               .real_base = cpu_to_be32(0xffffffff),
+               .real_size = cpu_to_be32(0xffffffff),
+               .virt_base = cpu_to_be32(0xffffffff),
+               .virt_size = cpu_to_be32(0xffffffff),
+               .load_base = cpu_to_be32(0xffffffff),
+               .min_rma = cpu_to_be32(256),            /* 256MB min RMA */
+               .min_load = cpu_to_be32(0xffffffff),    /* full client load */
+               .min_rma_percent = 0,   /* min RMA percentage of total RAM */
+               .max_pft_size = 48,     /* max log_2(hash table size) */
+       },
 
+       .vec3_len = VECTOR_LENGTH(sizeof(struct option_vector3)),
        /* option vector 3: processor options supported */
-       VECTOR_LENGTH(2),               /* length */
-       0,                              /* don't ignore, don't halt */
-       OV3_FP | OV3_VMX | OV3_DFP,
+       .vec3 = {
+               .byte1 = 0,                     /* don't ignore, don't halt */
+               .byte2 = OV3_FP | OV3_VMX | OV3_DFP,
+       },
 
+       .vec4_len = VECTOR_LENGTH(sizeof(struct option_vector4)),
        /* option vector 4: IBM PAPR implementation */
-       VECTOR_LENGTH(2),               /* length */
-       0,                              /* don't halt */
-       OV4_MIN_ENT_CAP,                /* minimum VP entitled capacity */
+       .vec4 = {
+               .byte1 = 0,                     /* don't halt */
+               .min_vp_cap = OV4_MIN_ENT_CAP,  /* minimum VP entitled capacity */
+       },
 
+       .vec5_len = VECTOR_LENGTH(sizeof(struct option_vector5)),
        /* option vector 5: PAPR/OF options */
-       VECTOR_LENGTH(21),              /* length */
-       0,                              /* don't ignore, don't halt */
-       OV5_FEAT(OV5_LPAR) | OV5_FEAT(OV5_SPLPAR) | OV5_FEAT(OV5_LARGE_PAGES) |
-       OV5_FEAT(OV5_DRCONF_MEMORY) | OV5_FEAT(OV5_DONATE_DEDICATE_CPU) |
+       .vec5 = {
+               .byte1 = 0,                             /* don't ignore, don't halt */
+               .byte2 = OV5_FEAT(OV5_LPAR) | OV5_FEAT(OV5_SPLPAR) | OV5_FEAT(OV5_LARGE_PAGES) |
+               OV5_FEAT(OV5_DRCONF_MEMORY) | OV5_FEAT(OV5_DONATE_DEDICATE_CPU) |
 #ifdef CONFIG_PCI_MSI
-       /* PCIe/MSI support.  Without MSI full PCIe is not supported */
-       OV5_FEAT(OV5_MSI),
+               /* PCIe/MSI support.  Without MSI full PCIe is not supported */
+               OV5_FEAT(OV5_MSI),
 #else
-       0,
+               0,
 #endif
-       0,
+               .byte3 = 0,
+               .cmo =
 #ifdef CONFIG_PPC_SMLPAR
-       OV5_FEAT(OV5_CMO) | OV5_FEAT(OV5_XCMO),
+               OV5_FEAT(OV5_CMO) | OV5_FEAT(OV5_XCMO),
 #else
-       0,
+               0,
 #endif
-       OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN),
-       0,
-       0,
-       0,
-       /* WARNING: The offset of the "number of cores" field below
-        * must match by the macro below. Update the definition if
-        * the structure layout changes.
-        */
-#define IBM_ARCH_VEC_NRCORES_OFFSET    133
-       W(NR_CPUS),                     /* number of cores supported */
-       0,
-       0,
-       0,
-       0,
-       OV5_FEAT(OV5_PFO_HW_RNG) | OV5_FEAT(OV5_PFO_HW_ENCR) |
-       OV5_FEAT(OV5_PFO_HW_842),                               /* Byte 17 */
-       0,                                                      /* Byte 18 */
-       0,                                                      /* Byte 19 */
-       0,                                                      /* Byte 20 */
-       OV5_FEAT(OV5_SUB_PROCESSORS),                           /* Byte 21 */
+               .associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN),
+               .bin_opts = 0,
+               .micro_checkpoint = 0,
+               .reserved0 = 0,
+               .max_cpus = cpu_to_be32(NR_CPUS),       /* number of cores supported */
+               .papr_level = 0,
+               .reserved1 = 0,
+               .platform_facilities = OV5_FEAT(OV5_PFO_HW_RNG) | OV5_FEAT(OV5_PFO_HW_ENCR) | OV5_FEAT(OV5_PFO_HW_842),
+               .reserved2 = 0,
+               .reserved3 = 0,
+               .subprocessors = 1,
+       },
 
        /* option vector 6: IBM PAPR hints */
-       VECTOR_LENGTH(3),               /* length */
-       0,
-       0,
-       OV6_LINUX,
+       .vec6_len = VECTOR_LENGTH(sizeof(struct option_vector6)),
+       .vec6 = {
+               .reserved = 0,
+               .secondary_pteg = 0,
+               .os_name = OV6_LINUX,
+       },
 };
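Because each vecN_len is now computed as VECTOR_LENGTH(sizeof(struct option_vectorN)), the packed structs must keep the exact byte counts the old open-coded table used (33 bytes for option vector 2, 21 for option vector 5). A compile-time sketch of that invariant, shown with C11 _Static_assert purely for illustration; the patch itself carries no such check:

_Static_assert(sizeof(struct option_vector2) == 33,
	       "option vector 2 layout changed");
_Static_assert(sizeof(struct option_vector5) == 21,
	       "option vector 5 layout changed");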
 
 /* Old method - ELF header with PT_NOTE sections only works on BE */
@@ -873,7 +980,6 @@ static void __init prom_send_capabilities(void)
        ihandle root;
        prom_arg_t ret;
        u32 cores;
-       unsigned char *ptcores;
 
        root = call_prom("open", 1, 1, ADDR("/"));
        if (root != 0) {
@@ -884,37 +990,18 @@ static void __init prom_send_capabilities(void)
                 * divide NR_CPUS.
                 */
 
-               /* The core value may start at an odd address. If such a word
-                * access is made at a cache line boundary, this leads to an
-                * exception which may not be handled at this time.
-                * Forcing a per byte access to avoid exception.
-                */
-               ptcores = &ibm_architecture_vec[IBM_ARCH_VEC_NRCORES_OFFSET];
-               cores = 0;
-               cores |= ptcores[0] << 24;
-               cores |= ptcores[1] << 16;
-               cores |= ptcores[2] << 8;
-               cores |= ptcores[3];
-               if (cores != NR_CPUS) {
-                       prom_printf("WARNING ! "
-                                   "ibm_architecture_vec structure inconsistent: %lu!\n",
-                                   cores);
-               } else {
-                       cores = DIV_ROUND_UP(NR_CPUS, prom_count_smt_threads());
-                       prom_printf("Max number of cores passed to firmware: %lu (NR_CPUS = %lu)\n",
-                                   cores, NR_CPUS);
-                       ptcores[0] = (cores >> 24) & 0xff;
-                       ptcores[1] = (cores >> 16) & 0xff;
-                       ptcores[2] = (cores >> 8) & 0xff;
-                       ptcores[3] = cores & 0xff;
-               }
+               cores = DIV_ROUND_UP(NR_CPUS, prom_count_smt_threads());
+               prom_printf("Max number of cores passed to firmware: %lu (NR_CPUS = %lu)\n",
+                           cores, NR_CPUS);
+
+               ibm_architecture_vec.vec5.max_cpus = cpu_to_be32(cores);
 
                /* try calling the ibm,client-architecture-support method */
                prom_printf("Calling ibm,client-architecture-support...");
                if (call_prom_ret("call-method", 3, 2, &ret,
                                  ADDR("ibm,client-architecture-support"),
                                  root,
-                                 ADDR(ibm_architecture_vec)) == 0) {
+                                 ADDR(&ibm_architecture_vec)) == 0) {
                        /* the call exists... */
                        if (ret)
                                prom_printf("\nWARNING: ibm,client-architecture"
index 270ee30abdcf739982438271d8b5957e7e830dbb..f516ac508ae33e7a8d73ab6c9d392ed44a2a1c64 100644 (file)
@@ -915,7 +915,7 @@ void __init setup_arch(char **cmdline_p)
        init_mm.context.pte_frag = NULL;
 #endif
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-       mm_iommu_init(&init_mm.context);
+       mm_iommu_init(&init_mm);
 #endif
        irqstack_early_init();
        exc_lvl_early_init();
index 7ac8e6eaab5ba24566f1f6fe06829e22727e86ea..c3e129080c31d77aa245c6ad5d377d44b8cd0424 100644 (file)
@@ -346,7 +346,7 @@ void early_setup_secondary(void)
 
 #endif /* CONFIG_SMP */
 
-#if defined(CONFIG_SMP) || defined(CONFIG_KEXEC)
+#if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE)
 static bool use_spinloop(void)
 {
        if (!IS_ENABLED(CONFIG_PPC_BOOK3E))
@@ -391,7 +391,7 @@ void smp_release_cpus(void)
 
        DBG(" <- smp_release_cpus()\n");
 }
-#endif /* CONFIG_SMP || CONFIG_KEXEC */
+#endif /* CONFIG_SMP || CONFIG_KEXEC_CORE */
 
 /*
  * Initialize some remaining members of the ppc64_caches and systemcfg
index 9c6f3fd580597e5fdfcc5fc46031d9be0aaac74b..893bd7f79be682decd077a3e05a42e1bc7520452 100644 (file)
@@ -193,7 +193,7 @@ int smp_request_message_ipi(int virq, int msg)
        if (msg < 0 || msg > PPC_MSG_DEBUGGER_BREAK) {
                return -EINVAL;
        }
-#if !defined(CONFIG_DEBUGGER) && !defined(CONFIG_KEXEC)
+#if !defined(CONFIG_DEBUGGER) && !defined(CONFIG_KEXEC_CORE)
        if (msg == PPC_MSG_DEBUGGER_BREAK) {
                return 1;
        }
@@ -325,7 +325,7 @@ void tick_broadcast(const struct cpumask *mask)
 }
 #endif
 
-#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
+#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE)
 void smp_send_debugger_break(void)
 {
        int cpu;
@@ -340,7 +340,7 @@ void smp_send_debugger_break(void)
 }
 #endif
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
 {
        crash_ipi_function_ptr = crash_ipi_callback;
index 91d278c9ab28dc46de13e7a7bb6e0da357dce5e2..4239aaf748866a666680792437059c3ad356a0eb 100644 (file)
@@ -64,8 +64,9 @@
 #include <asm/asm-prototypes.h>
 #include <asm/hmi.h>
 #include <sysdev/fsl_pci.h>
+#include <asm/kprobes.h>
 
-#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
+#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE)
 int (*__debugger)(struct pt_regs *regs) __read_mostly;
 int (*__debugger_ipi)(struct pt_regs *regs) __read_mostly;
 int (*__debugger_bpt)(struct pt_regs *regs) __read_mostly;
@@ -122,9 +123,6 @@ static unsigned long oops_begin(struct pt_regs *regs)
        int cpu;
        unsigned long flags;
 
-       if (debugger(regs))
-               return 1;
-
        oops_enter();
 
        /* racy, but better than risking deadlock. */
@@ -150,14 +148,15 @@ static void oops_end(unsigned long flags, struct pt_regs *regs,
                               int signr)
 {
        bust_spinlocks(0);
-       die_owner = -1;
        add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
        die_nest_count--;
        oops_exit();
        printk("\n");
-       if (!die_nest_count)
+       if (!die_nest_count) {
                /* Nest count reaches zero, release the lock. */
+               die_owner = -1;
                arch_spin_unlock(&die_lock);
+       }
        raw_local_irq_restore(flags);
 
        crash_fadump(regs, "die oops");
@@ -227,8 +226,12 @@ NOKPROBE_SYMBOL(__die);
 
 void die(const char *str, struct pt_regs *regs, long err)
 {
-       unsigned long flags = oops_begin(regs);
+       unsigned long flags;
+
+       if (debugger(regs))
+               return;
 
+       flags = oops_begin(regs);
        if (__die(str, regs, err))
                err = 0;
        oops_end(flags, regs, err);
@@ -824,6 +827,9 @@ void single_step_exception(struct pt_regs *regs)
 
        clear_single_step(regs);
 
+       if (kprobe_post_handler(regs))
+               return;
+
        if (notify_die(DIE_SSTEP, "single_step", regs, 5,
                                        5, SIGTRAP) == NOTIFY_STOP)
                goto bail;
@@ -1177,6 +1183,9 @@ void program_check_exception(struct pt_regs *regs)
                if (debugger_bpt(regs))
                        goto bail;
 
+               if (kprobe_handler(regs))
+                       goto bail;
+
                /* trap exception */
                if (notify_die(DIE_BPT, "breakpoint", regs, 5, 5, SIGTRAP)
                                == NOTIFY_STOP)
@@ -1510,7 +1519,8 @@ void facility_unavailable_exception(struct pt_regs *regs)
                return;
        }
 
-       if ((status < ARRAY_SIZE(facility_strings)) &&
+       if ((hv || status >= 2) &&
+           (status < ARRAY_SIZE(facility_strings)) &&
            facility_strings[status])
                facility = facility_strings[status];
 
@@ -1518,9 +1528,8 @@ void facility_unavailable_exception(struct pt_regs *regs)
        if (!arch_irq_disabled_regs(regs))
                local_irq_enable();
 
-       pr_err_ratelimited(
-               "%sFacility '%s' unavailable, exception at 0x%lx, MSR=%lx\n",
-               hv ? "Hypervisor " : "", facility, regs->nip, regs->msr);
+       pr_err_ratelimited("%sFacility '%s' unavailable (%d), exception at 0x%lx, MSR=%lx\n",
+               hv ? "Hypervisor " : "", facility, status, regs->nip, regs->msr);
 
 out:
        if (user_mode(regs)) {
@@ -1745,6 +1754,9 @@ void DebugException(struct pt_regs *regs, unsigned long debug_status)
                        return;
                }
 
+               if (kprobe_post_handler(regs))
+                       return;
+
                if (notify_die(DIE_SSTEP, "block_step", regs, 5,
                               5, SIGTRAP) == NOTIFY_STOP) {
                        return;
@@ -1759,6 +1771,9 @@ void DebugException(struct pt_regs *regs, unsigned long debug_status)
                /* Clear the instruction completion event */
                mtspr(SPRN_DBSR, DBSR_IC);
 
+               if (kprobe_post_handler(regs))
+                       return;
+
                if (notify_die(DIE_SSTEP, "single_step", regs, 5,
                               5, SIGTRAP) == NOTIFY_STOP) {
                        return;
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
deleted file mode 100644 (file)
index 2c8fb3e..0000000
+++ /dev/null
@@ -1,1705 +0,0 @@
-/*
- * IBM PowerPC Virtual I/O Infrastructure Support.
- *
- *    Copyright (c) 2003,2008 IBM Corp.
- *     Dave Engebretsen engebret@us.ibm.com
- *     Santiago Leon santil@us.ibm.com
- *     Hollis Blanchard <hollisb@us.ibm.com>
- *     Stephen Rothwell
- *     Robert Jennings <rcjenn@us.ibm.com>
- *
- *      This program is free software; you can redistribute it and/or
- *      modify it under the terms of the GNU General Public License
- *      as published by the Free Software Foundation; either version
- *      2 of the License, or (at your option) any later version.
- */
-
-#include <linux/cpu.h>
-#include <linux/types.h>
-#include <linux/delay.h>
-#include <linux/stat.h>
-#include <linux/device.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/console.h>
-#include <linux/export.h>
-#include <linux/mm.h>
-#include <linux/dma-mapping.h>
-#include <linux/kobject.h>
-
-#include <asm/iommu.h>
-#include <asm/dma.h>
-#include <asm/vio.h>
-#include <asm/prom.h>
-#include <asm/firmware.h>
-#include <asm/tce.h>
-#include <asm/page.h>
-#include <asm/hvcall.h>
-
-static struct vio_dev vio_bus_device  = { /* fake "parent" device */
-       .name = "vio",
-       .type = "",
-       .dev.init_name = "vio",
-       .dev.bus = &vio_bus_type,
-};
-
-#ifdef CONFIG_PPC_SMLPAR
-/**
- * vio_cmo_pool - A pool of IO memory for CMO use
- *
- * @size: The size of the pool in bytes
- * @free: The amount of free memory in the pool
- */
-struct vio_cmo_pool {
-       size_t size;
-       size_t free;
-};
-
-/* How many ms to delay queued balance work */
-#define VIO_CMO_BALANCE_DELAY 100
-
-/* Portion out IO memory to CMO devices by this chunk size */
-#define VIO_CMO_BALANCE_CHUNK 131072
-
-/**
- * vio_cmo_dev_entry - A device that is CMO-enabled and requires entitlement
- *
- * @vio_dev: struct vio_dev pointer
- * @list: pointer to other devices on bus that are being tracked
- */
-struct vio_cmo_dev_entry {
-       struct vio_dev *viodev;
-       struct list_head list;
-};
-
-/**
- * vio_cmo - VIO bus accounting structure for CMO entitlement
- *
- * @lock: spinlock for entire structure
- * @balance_q: work queue for balancing system entitlement
- * @device_list: list of CMO-enabled devices requiring entitlement
- * @entitled: total system entitlement in bytes
- * @reserve: pool of memory from which devices reserve entitlement, incl. spare
- * @excess: pool of excess entitlement not needed for device reserves or spare
- * @spare: IO memory for device hotplug functionality
- * @min: minimum necessary for system operation
- * @desired: desired memory for system operation
- * @curr: bytes currently allocated
- * @high: high water mark for IO data usage
- */
-static struct vio_cmo {
-       spinlock_t lock;
-       struct delayed_work balance_q;
-       struct list_head device_list;
-       size_t entitled;
-       struct vio_cmo_pool reserve;
-       struct vio_cmo_pool excess;
-       size_t spare;
-       size_t min;
-       size_t desired;
-       size_t curr;
-       size_t high;
-} vio_cmo;
-
-/**
- * vio_cmo_OF_devices - Count the number of OF devices that have DMA windows
- */
-static int vio_cmo_num_OF_devs(void)
-{
-       struct device_node *node_vroot;
-       int count = 0;
-
-       /*
-        * Count the number of vdevice entries with an
-        * ibm,my-dma-window OF property
-        */
-       node_vroot = of_find_node_by_name(NULL, "vdevice");
-       if (node_vroot) {
-               struct device_node *of_node;
-               struct property *prop;
-
-               for_each_child_of_node(node_vroot, of_node) {
-                       prop = of_find_property(of_node, "ibm,my-dma-window",
-                                              NULL);
-                       if (prop)
-                               count++;
-               }
-       }
-       of_node_put(node_vroot);
-       return count;
-}
-
-/**
- * vio_cmo_alloc - allocate IO memory for CMO-enable devices
- *
- * @viodev: VIO device requesting IO memory
- * @size: size of allocation requested
- *
- * Allocations come from memory reserved for the devices and any excess
- * IO memory available to all devices.  The spare pool used to service
- * hotplug must be equal to %VIO_CMO_MIN_ENT for the excess pool to be
- * made available.
- *
- * Return codes:
- *  0 for successful allocation and -ENOMEM for a failure
- */
-static inline int vio_cmo_alloc(struct vio_dev *viodev, size_t size)
-{
-       unsigned long flags;
-       size_t reserve_free = 0;
-       size_t excess_free = 0;
-       int ret = -ENOMEM;
-
-       spin_lock_irqsave(&vio_cmo.lock, flags);
-
-       /* Determine the amount of free entitlement available in reserve */
-       if (viodev->cmo.entitled > viodev->cmo.allocated)
-               reserve_free = viodev->cmo.entitled - viodev->cmo.allocated;
-
-       /* If spare is not fulfilled, the excess pool can not be used. */
-       if (vio_cmo.spare >= VIO_CMO_MIN_ENT)
-               excess_free = vio_cmo.excess.free;
-
-       /* The request can be satisfied */
-       if ((reserve_free + excess_free) >= size) {
-               vio_cmo.curr += size;
-               if (vio_cmo.curr > vio_cmo.high)
-                       vio_cmo.high = vio_cmo.curr;
-               viodev->cmo.allocated += size;
-               size -= min(reserve_free, size);
-               vio_cmo.excess.free -= size;
-               ret = 0;
-       }
-
-       spin_unlock_irqrestore(&vio_cmo.lock, flags);
-       return ret;
-}
-
-/**
- * vio_cmo_dealloc - deallocate IO memory from CMO-enable devices
- * @viodev: VIO device freeing IO memory
- * @size: size of deallocation
- *
- * IO memory is freed by the device back to the correct memory pools.
- * The spare pool is replenished first from either memory pool, then
- * the reserve pool is used to reduce device entitlement, the excess
- * pool is used to increase the reserve pool toward the desired entitlement
- * target, and then the remaining memory is returned to the pools.
- *
- */
-static inline void vio_cmo_dealloc(struct vio_dev *viodev, size_t size)
-{
-       unsigned long flags;
-       size_t spare_needed = 0;
-       size_t excess_freed = 0;
-       size_t reserve_freed = size;
-       size_t tmp;
-       int balance = 0;
-
-       spin_lock_irqsave(&vio_cmo.lock, flags);
-       vio_cmo.curr -= size;
-
-       /* Amount of memory freed from the excess pool */
-       if (viodev->cmo.allocated > viodev->cmo.entitled) {
-               excess_freed = min(reserve_freed, (viodev->cmo.allocated -
-                                                  viodev->cmo.entitled));
-               reserve_freed -= excess_freed;
-       }
-
-       /* Remove allocation from device */
-       viodev->cmo.allocated -= (reserve_freed + excess_freed);
-
-       /* Spare is a subset of the reserve pool, replenish it first. */
-       spare_needed = VIO_CMO_MIN_ENT - vio_cmo.spare;
-
-       /*
-        * Replenish the spare in the reserve pool from the excess pool.
-        * This moves entitlement into the reserve pool.
-        */
-       if (spare_needed && excess_freed) {
-               tmp = min(excess_freed, spare_needed);
-               vio_cmo.excess.size -= tmp;
-               vio_cmo.reserve.size += tmp;
-               vio_cmo.spare += tmp;
-               excess_freed -= tmp;
-               spare_needed -= tmp;
-               balance = 1;
-       }
-
-       /*
-        * Replenish the spare in the reserve pool from the reserve pool.
-        * This removes entitlement from the device down to VIO_CMO_MIN_ENT,
-        * if needed, and gives it to the spare pool. The amount of used
-        * memory in this pool does not change.
-        */
-       if (spare_needed && reserve_freed) {
-               tmp = min3(spare_needed, reserve_freed, (viodev->cmo.entitled - VIO_CMO_MIN_ENT));
-
-               vio_cmo.spare += tmp;
-               viodev->cmo.entitled -= tmp;
-               reserve_freed -= tmp;
-               spare_needed -= tmp;
-               balance = 1;
-       }
-
-       /*
-        * Increase the reserve pool until the desired allocation is met.
-        * Move an allocation freed from the excess pool into the reserve
-        * pool and schedule a balance operation.
-        */
-       if (excess_freed && (vio_cmo.desired > vio_cmo.reserve.size)) {
-               tmp = min(excess_freed, (vio_cmo.desired - vio_cmo.reserve.size));
-
-               vio_cmo.excess.size -= tmp;
-               vio_cmo.reserve.size += tmp;
-               excess_freed -= tmp;
-               balance = 1;
-       }
-
-       /* Return memory from the excess pool to that pool */
-       if (excess_freed)
-               vio_cmo.excess.free += excess_freed;
-
-       if (balance)
-               schedule_delayed_work(&vio_cmo.balance_q, VIO_CMO_BALANCE_DELAY);
-       spin_unlock_irqrestore(&vio_cmo.lock, flags);
-}
-
-/**
- * vio_cmo_entitlement_update - Manage system entitlement changes
- *
- * @new_entitlement: new system entitlement to attempt to accommodate
- *
- * Increases in entitlement will be used to fulfill the spare entitlement
- * and the rest is given to the excess pool.  Decreases, if they are
- * possible, come from the excess pool and from unused device entitlement
- *
- * Returns: 0 on success, -ENOMEM when change can not be made
- */
-int vio_cmo_entitlement_update(size_t new_entitlement)
-{
-       struct vio_dev *viodev;
-       struct vio_cmo_dev_entry *dev_ent;
-       unsigned long flags;
-       size_t avail, delta, tmp;
-
-       spin_lock_irqsave(&vio_cmo.lock, flags);
-
-       /* Entitlement increases */
-       if (new_entitlement > vio_cmo.entitled) {
-               delta = new_entitlement - vio_cmo.entitled;
-
-               /* Fulfill spare allocation */
-               if (vio_cmo.spare < VIO_CMO_MIN_ENT) {
-                       tmp = min(delta, (VIO_CMO_MIN_ENT - vio_cmo.spare));
-                       vio_cmo.spare += tmp;
-                       vio_cmo.reserve.size += tmp;
-                       delta -= tmp;
-               }
-
-               /* Remaining new allocation goes to the excess pool */
-               vio_cmo.entitled += delta;
-               vio_cmo.excess.size += delta;
-               vio_cmo.excess.free += delta;
-
-               goto out;
-       }
-
-       /* Entitlement decreases */
-       delta = vio_cmo.entitled - new_entitlement;
-       avail = vio_cmo.excess.free;
-
-       /*
-        * Need to check how much unused entitlement each device can
-        * sacrifice to fulfill entitlement change.
-        */
-       list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
-               if (avail >= delta)
-                       break;
-
-               viodev = dev_ent->viodev;
-               if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
-                   (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
-                               avail += viodev->cmo.entitled -
-                                        max_t(size_t, viodev->cmo.allocated,
-                                              VIO_CMO_MIN_ENT);
-       }
-
-       if (delta <= avail) {
-               vio_cmo.entitled -= delta;
-
-               /* Take entitlement from the excess pool first */
-               tmp = min(vio_cmo.excess.free, delta);
-               vio_cmo.excess.size -= tmp;
-               vio_cmo.excess.free -= tmp;
-               delta -= tmp;
-
-               /*
-                * Remove all but VIO_CMO_MIN_ENT bytes from devices
-                * until entitlement change is served
-                */
-               list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
-                       if (!delta)
-                               break;
-
-                       viodev = dev_ent->viodev;
-                       tmp = 0;
-                       if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
-                           (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
-                               tmp = viodev->cmo.entitled -
-                                     max_t(size_t, viodev->cmo.allocated,
-                                           VIO_CMO_MIN_ENT);
-                       viodev->cmo.entitled -= min(tmp, delta);
-                       delta -= min(tmp, delta);
-               }
-       } else {
-               spin_unlock_irqrestore(&vio_cmo.lock, flags);
-               return -ENOMEM;
-       }
-
-out:
-       schedule_delayed_work(&vio_cmo.balance_q, 0);
-       spin_unlock_irqrestore(&vio_cmo.lock, flags);
-       return 0;
-}
-
-/**
- * vio_cmo_balance - Balance entitlement among devices
- *
- * @work: work queue structure for this operation
- *
- * Any system entitlement above the minimum needed for devices, or
- * already allocated to devices, can be distributed to the devices.
- * The list of devices is iterated through to recalculate the desired
- * entitlement level and to determine how much entitlement above the
- * minimum entitlement is allocated to devices.
- *
- * Small chunks of the available entitlement are given to devices until
- * their requirements are fulfilled or there is no entitlement left to give.
- * Upon completion sizes of the reserve and excess pools are calculated.
- *
- * The system minimum entitlement level is also recalculated here.
- * Entitlement will be reserved for devices even after vio_bus_remove to
- * accommodate reloading the driver.  The OF tree is walked to count the
- * number of devices present and this will remove entitlement for devices
- * that have actually left the system after having vio_bus_remove called.
- */
-static void vio_cmo_balance(struct work_struct *work)
-{
-       struct vio_cmo *cmo;
-       struct vio_dev *viodev;
-       struct vio_cmo_dev_entry *dev_ent;
-       unsigned long flags;
-       size_t avail = 0, level, chunk, need;
-       int devcount = 0, fulfilled;
-
-       cmo = container_of(work, struct vio_cmo, balance_q.work);
-
-       spin_lock_irqsave(&vio_cmo.lock, flags);
-
-       /* Calculate minimum entitlement and fulfill spare */
-       cmo->min = vio_cmo_num_OF_devs() * VIO_CMO_MIN_ENT;
-       BUG_ON(cmo->min > cmo->entitled);
-       cmo->spare = min_t(size_t, VIO_CMO_MIN_ENT, (cmo->entitled - cmo->min));
-       cmo->min += cmo->spare;
-       cmo->desired = cmo->min;
-
-       /*
-        * Determine how much entitlement is available and reset device
-        * entitlements
-        */
-       avail = cmo->entitled - cmo->spare;
-       list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
-               viodev = dev_ent->viodev;
-               devcount++;
-               viodev->cmo.entitled = VIO_CMO_MIN_ENT;
-               cmo->desired += (viodev->cmo.desired - VIO_CMO_MIN_ENT);
-               avail -= max_t(size_t, viodev->cmo.allocated, VIO_CMO_MIN_ENT);
-       }
-
-       /*
-        * Having provided each device with the minimum entitlement, loop
-        * over the devices portioning out the remaining entitlement
-        * until there is nothing left.
-        */
-       level = VIO_CMO_MIN_ENT;
-       while (avail) {
-               fulfilled = 0;
-               list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
-                       viodev = dev_ent->viodev;
-
-                       if (viodev->cmo.desired <= level) {
-                               fulfilled++;
-                               continue;
-                       }
-
-                       /*
-                        * Give the device up to VIO_CMO_BALANCE_CHUNK
-                        * bytes of entitlement, but do not exceed the
-                        * desired level of entitlement for the device.
-                        */
-                       chunk = min_t(size_t, avail, VIO_CMO_BALANCE_CHUNK);
-                       chunk = min(chunk, (viodev->cmo.desired -
-                                           viodev->cmo.entitled));
-                       viodev->cmo.entitled += chunk;
-
-                       /*
-                        * If the memory for this entitlement increase was
-                        * already allocated to the device it does not come
-                        * from the available pool being portioned out.
-                        */
-                       need = max(viodev->cmo.allocated, viodev->cmo.entitled)-
-                              max(viodev->cmo.allocated, level);
-                       avail -= need;
-
-               }
-               if (fulfilled == devcount)
-                       break;
-               level += VIO_CMO_BALANCE_CHUNK;
-       }
-
-       /* Calculate new reserve and excess pool sizes */
-       cmo->reserve.size = cmo->min;
-       cmo->excess.free = 0;
-       cmo->excess.size = 0;
-       need = 0;
-       list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
-               viodev = dev_ent->viodev;
-               /* Calculated reserve size above the minimum entitlement */
-               if (viodev->cmo.entitled)
-                       cmo->reserve.size += (viodev->cmo.entitled -
-                                             VIO_CMO_MIN_ENT);
-               /* Calculated used excess entitlement */
-               if (viodev->cmo.allocated > viodev->cmo.entitled)
-                       need += viodev->cmo.allocated - viodev->cmo.entitled;
-       }
-       cmo->excess.size = cmo->entitled - cmo->reserve.size;
-       cmo->excess.free = cmo->excess.size - need;
-
-       cancel_delayed_work(to_delayed_work(work));
-       spin_unlock_irqrestore(&vio_cmo.lock, flags);
-}
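
As a purely illustrative walk-through of the loop above (made-up numbers, assuming nothing is allocated yet and a chunk size of 128 KiB): with two devices below their desired level, A wanting 300 KiB above VIO_CMO_MIN_ENT and B wanting 100 KiB, and 350 KiB left to distribute, the first pass hands A 128 KiB and B its full 100 KiB (capped at its desired level); the second pass finds B fulfilled and gives A the remaining 122 KiB, at which point avail reaches zero and the loop ends. A finishes at 250 KiB above the minimum, 50 KiB short of its desired level, until a later balance runs with more entitlement available.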
-
-static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size,
-                                         dma_addr_t *dma_handle, gfp_t flag,
-                                         unsigned long attrs)
-{
-       struct vio_dev *viodev = to_vio_dev(dev);
-       void *ret;
-
-       if (vio_cmo_alloc(viodev, roundup(size, PAGE_SIZE))) {
-               atomic_inc(&viodev->cmo.allocs_failed);
-               return NULL;
-       }
-
-       ret = dma_iommu_ops.alloc(dev, size, dma_handle, flag, attrs);
-       if (unlikely(ret == NULL)) {
-               vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE));
-               atomic_inc(&viodev->cmo.allocs_failed);
-       }
-
-       return ret;
-}
-
-static void vio_dma_iommu_free_coherent(struct device *dev, size_t size,
-                                       void *vaddr, dma_addr_t dma_handle,
-                                       unsigned long attrs)
-{
-       struct vio_dev *viodev = to_vio_dev(dev);
-
-       dma_iommu_ops.free(dev, size, vaddr, dma_handle, attrs);
-
-       vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE));
-}
-
-static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page,
-                                         unsigned long offset, size_t size,
-                                         enum dma_data_direction direction,
-                                         unsigned long attrs)
-{
-       struct vio_dev *viodev = to_vio_dev(dev);
-       struct iommu_table *tbl;
-       dma_addr_t ret = DMA_ERROR_CODE;
-
-       tbl = get_iommu_table_base(dev);
-       if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) {
-               atomic_inc(&viodev->cmo.allocs_failed);
-               return ret;
-       }
-
-       ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs);
-       if (unlikely(dma_mapping_error(dev, ret))) {
-               vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
-               atomic_inc(&viodev->cmo.allocs_failed);
-       }
-
-       return ret;
-}
-
-static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
-                                    size_t size,
-                                    enum dma_data_direction direction,
-                                    unsigned long attrs)
-{
-       struct vio_dev *viodev = to_vio_dev(dev);
-       struct iommu_table *tbl;
-
-       tbl = get_iommu_table_base(dev);
-       dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs);
-
-       vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
-}
-
-static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
-                                int nelems, enum dma_data_direction direction,
-                                unsigned long attrs)
-{
-       struct vio_dev *viodev = to_vio_dev(dev);
-       struct iommu_table *tbl;
-       struct scatterlist *sgl;
-       int ret, count;
-       size_t alloc_size = 0;
-
-       tbl = get_iommu_table_base(dev);
-       for_each_sg(sglist, sgl, nelems, count)
-               alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl));
-
-       if (vio_cmo_alloc(viodev, alloc_size)) {
-               atomic_inc(&viodev->cmo.allocs_failed);
-               return 0;
-       }
-
-       ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs);
-
-       if (unlikely(!ret)) {
-               vio_cmo_dealloc(viodev, alloc_size);
-               atomic_inc(&viodev->cmo.allocs_failed);
-               return ret;
-       }
-
-       for_each_sg(sglist, sgl, ret, count)
-               alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
-       if (alloc_size)
-               vio_cmo_dealloc(viodev, alloc_size);
-
-       return ret;
-}
-
-static void vio_dma_iommu_unmap_sg(struct device *dev,
-               struct scatterlist *sglist, int nelems,
-               enum dma_data_direction direction,
-               unsigned long attrs)
-{
-       struct vio_dev *viodev = to_vio_dev(dev);
-       struct iommu_table *tbl;
-       struct scatterlist *sgl;
-       size_t alloc_size = 0;
-       int count;
-
-       tbl = get_iommu_table_base(dev);
-       for_each_sg(sglist, sgl, nelems, count)
-               alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
-
-       dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
-
-       vio_cmo_dealloc(viodev, alloc_size);
-}
-
-static int vio_dma_iommu_dma_supported(struct device *dev, u64 mask)
-{
-        return dma_iommu_ops.dma_supported(dev, mask);
-}
-
-static u64 vio_dma_get_required_mask(struct device *dev)
-{
-        return dma_iommu_ops.get_required_mask(dev);
-}
-
-static struct dma_map_ops vio_dma_mapping_ops = {
-       .alloc             = vio_dma_iommu_alloc_coherent,
-       .free              = vio_dma_iommu_free_coherent,
-       .mmap              = dma_direct_mmap_coherent,
-       .map_sg            = vio_dma_iommu_map_sg,
-       .unmap_sg          = vio_dma_iommu_unmap_sg,
-       .map_page          = vio_dma_iommu_map_page,
-       .unmap_page        = vio_dma_iommu_unmap_page,
-       .dma_supported     = vio_dma_iommu_dma_supported,
-       .get_required_mask = vio_dma_get_required_mask,
-};
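
From a driver's point of view these operations are transparent: the device still uses the generic DMA API, and the CMO wrappers above simply charge or refund entitlement around the underlying dma_iommu_ops calls. A minimal sketch of what that looks like in a hypothetical VIO driver (the function name and sizes are illustrative, not part of this file):

#include <linux/dma-mapping.h>
#include <asm/vio.h>

static int hypothetical_setup_ring(struct vio_dev *vdev)
{
        dma_addr_t ring_dma;
        void *ring;

        /* Dispatched to vio_dma_iommu_alloc_coherent() when CMO is active:
         * the page-rounded size is charged against the device's entitlement
         * before the IOMMU allocation is attempted. */
        ring = dma_alloc_coherent(&vdev->dev, 16 * 1024, &ring_dma, GFP_KERNEL);
        if (!ring)
                return -ENOMEM; /* failure also shows up in cmo_allocs_failed */

        /* ... use the ring ... */

        dma_free_coherent(&vdev->dev, 16 * 1024, ring, ring_dma);
        return 0;
}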
-
-/**
- * vio_cmo_set_dev_desired - Set desired entitlement for a device
- *
- * @viodev: struct vio_dev for device to alter
- * @desired: new desired entitlement level in bytes
- *
- * For use by devices to request a change to their entitlement at runtime or
- * through sysfs.  The desired entitlement level is changed and a balancing
- * of system resources is scheduled to run in the future.
- */
-void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired)
-{
-       unsigned long flags;
-       struct vio_cmo_dev_entry *dev_ent;
-       int found = 0;
-
-       if (!firmware_has_feature(FW_FEATURE_CMO))
-               return;
-
-       spin_lock_irqsave(&vio_cmo.lock, flags);
-       if (desired < VIO_CMO_MIN_ENT)
-               desired = VIO_CMO_MIN_ENT;
-
-       /*
-        * Changes will not be made for devices not in the device list.
-        * If it is not in the device list, then no driver is loaded
-        * for the device and it can not receive entitlement.
-        */
-       list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
-               if (viodev == dev_ent->viodev) {
-                       found = 1;
-                       break;
-               }
-       if (!found) {
-               spin_unlock_irqrestore(&vio_cmo.lock, flags);
-               return;
-       }
-
-       /* Increase/decrease in desired device entitlement */
-       if (desired >= viodev->cmo.desired) {
-               /* Just bump the bus and device values prior to a balance*/
-               vio_cmo.desired += desired - viodev->cmo.desired;
-               viodev->cmo.desired = desired;
-       } else {
-               /* Decrease bus and device values for desired entitlement */
-               vio_cmo.desired -= viodev->cmo.desired - desired;
-               viodev->cmo.desired = desired;
-               /*
-                * If less entitlement is desired than current entitlement, move
-                * any reserve memory in the change region to the excess pool.
-                */
-               if (viodev->cmo.entitled > desired) {
-                       vio_cmo.reserve.size -= viodev->cmo.entitled - desired;
-                       vio_cmo.excess.size += viodev->cmo.entitled - desired;
-                       /*
-                        * If entitlement moving from the reserve pool to the
-                        * excess pool is currently unused, add to the excess
-                        * free counter.
-                        */
-                       if (viodev->cmo.allocated < viodev->cmo.entitled)
-                               vio_cmo.excess.free += viodev->cmo.entitled -
-                                                      max(viodev->cmo.allocated, desired);
-                       viodev->cmo.entitled = desired;
-               }
-       }
-       schedule_delayed_work(&vio_cmo.balance_q, 0);
-       spin_unlock_irqrestore(&vio_cmo.lock, flags);
-}
-
-/**
- * vio_cmo_bus_probe - Handle CMO specific bus probe activities
- *
- * @viodev - Pointer to struct vio_dev for device
- *
- * Determine the device's IO memory entitlement needs, attempting
- * to satisfy the system minimum entitlement at first and scheduling
- * a balance operation to take care of the rest at a later time.
- *
- * Returns: 0 on success, -EINVAL when device doesn't support CMO, and
- *          -ENOMEM when entitlement is not available for device or
- *          device entry.
- *
- */
-static int vio_cmo_bus_probe(struct vio_dev *viodev)
-{
-       struct vio_cmo_dev_entry *dev_ent;
-       struct device *dev = &viodev->dev;
-       struct iommu_table *tbl;
-       struct vio_driver *viodrv = to_vio_driver(dev->driver);
-       unsigned long flags;
-       size_t size;
-       bool dma_capable = false;
-
-       tbl = get_iommu_table_base(dev);
-
-       /* A device requires entitlement if it has a DMA window property */
-       switch (viodev->family) {
-       case VDEVICE:
-               if (of_get_property(viodev->dev.of_node,
-                                       "ibm,my-dma-window", NULL))
-                       dma_capable = true;
-               break;
-       case PFO:
-               dma_capable = false;
-               break;
-       default:
-               dev_warn(dev, "unknown device family: %d\n", viodev->family);
-               BUG();
-               break;
-       }
-
-       /* Configure entitlement for the device. */
-       if (dma_capable) {
-               /* Check that the driver is CMO enabled and get desired DMA */
-               if (!viodrv->get_desired_dma) {
-                       dev_err(dev, "%s: device driver does not support CMO\n",
-                               __func__);
-                       return -EINVAL;
-               }
-
-               viodev->cmo.desired =
-                       IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev), tbl);
-               if (viodev->cmo.desired < VIO_CMO_MIN_ENT)
-                       viodev->cmo.desired = VIO_CMO_MIN_ENT;
-               size = VIO_CMO_MIN_ENT;
-
-               dev_ent = kmalloc(sizeof(struct vio_cmo_dev_entry),
-                                 GFP_KERNEL);
-               if (!dev_ent)
-                       return -ENOMEM;
-
-               dev_ent->viodev = viodev;
-               spin_lock_irqsave(&vio_cmo.lock, flags);
-               list_add(&dev_ent->list, &vio_cmo.device_list);
-       } else {
-               viodev->cmo.desired = 0;
-               size = 0;
-               spin_lock_irqsave(&vio_cmo.lock, flags);
-       }
-
-       /*
-        * If the needs for vio_cmo.min have not changed since they
-        * were last set, the number of devices in the OF tree has
-        * been constant and the IO memory for this is already in
-        * the reserve pool.
-        */
-       if (vio_cmo.min == ((vio_cmo_num_OF_devs() + 1) *
-                           VIO_CMO_MIN_ENT)) {
-               /* Update desired entitlement if device requires it */
-               if (size)
-                       vio_cmo.desired += (viodev->cmo.desired -
-                                       VIO_CMO_MIN_ENT);
-       } else {
-               size_t tmp;
-
-               tmp = vio_cmo.spare + vio_cmo.excess.free;
-               if (tmp < size) {
-                       dev_err(dev, "%s: insufficient free "
-                               "entitlement to add device. "
-                               "Need %lu, have %lu\n", __func__,
-                               size, (vio_cmo.spare + tmp));
-                       spin_unlock_irqrestore(&vio_cmo.lock, flags);
-                       return -ENOMEM;
-               }
-
-               /* Use excess pool first to fulfill request */
-               tmp = min(size, vio_cmo.excess.free);
-               vio_cmo.excess.free -= tmp;
-               vio_cmo.excess.size -= tmp;
-               vio_cmo.reserve.size += tmp;
-
-               /* Use spare if excess pool was insufficient */
-               vio_cmo.spare -= size - tmp;
-
-               /* Update bus accounting */
-               vio_cmo.min += size;
-               vio_cmo.desired += viodev->cmo.desired;
-       }
-       spin_unlock_irqrestore(&vio_cmo.lock, flags);
-       return 0;
-}
-
-/**
- * vio_cmo_bus_remove - Handle CMO specific bus removal activities
- *
- * @viodev - Pointer to struct vio_dev for device
- *
- * Remove the device from the cmo device list.  The minimum entitlement
- * will be reserved for the device as long as it is in the system.  The
- * rest of the entitlement the device had been allocated will be returned
- * to the system.
- */
-static void vio_cmo_bus_remove(struct vio_dev *viodev)
-{
-       struct vio_cmo_dev_entry *dev_ent;
-       unsigned long flags;
-       size_t tmp;
-
-       spin_lock_irqsave(&vio_cmo.lock, flags);
-       if (viodev->cmo.allocated) {
-               dev_err(&viodev->dev, "%s: device had %lu bytes of IO "
-                       "allocated after remove operation.\n",
-                       __func__, viodev->cmo.allocated);
-               BUG();
-       }
-
-       /*
-        * Remove the device from the device list being maintained for
-        * CMO enabled devices.
-        */
-       list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
-               if (viodev == dev_ent->viodev) {
-                       list_del(&dev_ent->list);
-                       kfree(dev_ent);
-                       break;
-               }
-
-       /*
-        * Devices may not require any entitlement and they do not need
-        * to be processed.  Otherwise, return the device's entitlement
-        * back to the pools.
-        */
-       if (viodev->cmo.entitled) {
-               /*
-                * This device has not yet left the OF tree; its
-                * minimum entitlement remains in vio_cmo.min and
-                * vio_cmo.desired
-                */
-               vio_cmo.desired -= (viodev->cmo.desired - VIO_CMO_MIN_ENT);
-
-               /*
-                * Save min allocation for device in reserve as long
-                * as it exists in OF tree as determined by later
-                * balance operation
-                */
-               viodev->cmo.entitled -= VIO_CMO_MIN_ENT;
-
-               /* Replenish spare from freed reserve pool */
-               if (viodev->cmo.entitled && (vio_cmo.spare < VIO_CMO_MIN_ENT)) {
-                       tmp = min(viodev->cmo.entitled, (VIO_CMO_MIN_ENT -
-                                                        vio_cmo.spare));
-                       vio_cmo.spare += tmp;
-                       viodev->cmo.entitled -= tmp;
-               }
-
-               /* Remaining reserve goes to excess pool */
-               vio_cmo.excess.size += viodev->cmo.entitled;
-               vio_cmo.excess.free += viodev->cmo.entitled;
-               vio_cmo.reserve.size -= viodev->cmo.entitled;
-
-               /*
-                * Until the device is removed it will keep a
-                * minimum entitlement; this will guarantee that
-                * a module unload/load cycle will succeed.
-                */
-               viodev->cmo.entitled = VIO_CMO_MIN_ENT;
-               viodev->cmo.desired = VIO_CMO_MIN_ENT;
-               atomic_set(&viodev->cmo.allocs_failed, 0);
-       }
-
-       spin_unlock_irqrestore(&vio_cmo.lock, flags);
-}
-
-static void vio_cmo_set_dma_ops(struct vio_dev *viodev)
-{
-       set_dma_ops(&viodev->dev, &vio_dma_mapping_ops);
-}
-
-/**
- * vio_cmo_bus_init - CMO entitlement initialization at bus init time
- *
- * Set up the reserve and excess entitlement pools based on available
- * system entitlement and the number of devices in the OF tree that
- * require entitlement in the reserve pool.
- */
-static void vio_cmo_bus_init(void)
-{
-       struct hvcall_mpp_data mpp_data;
-       int err;
-
-       memset(&vio_cmo, 0, sizeof(struct vio_cmo));
-       spin_lock_init(&vio_cmo.lock);
-       INIT_LIST_HEAD(&vio_cmo.device_list);
-       INIT_DELAYED_WORK(&vio_cmo.balance_q, vio_cmo_balance);
-
-       /* Get current system entitlement */
-       err = h_get_mpp(&mpp_data);
-
-       /*
-        * On failure, continue with entitlement set to 0; the system will
-        * panic() later when the spare entitlement is reserved.
-        */
-       if (err != H_SUCCESS) {
-               printk(KERN_ERR "%s: unable to determine system IO "\
-                      "entitlement. (%d)\n", __func__, err);
-               vio_cmo.entitled = 0;
-       } else {
-               vio_cmo.entitled = mpp_data.entitled_mem;
-       }
-
-       /* Set reservation and check against entitlement */
-       vio_cmo.spare = VIO_CMO_MIN_ENT;
-       vio_cmo.reserve.size = vio_cmo.spare;
-       vio_cmo.reserve.size += (vio_cmo_num_OF_devs() *
-                                VIO_CMO_MIN_ENT);
-       if (vio_cmo.reserve.size > vio_cmo.entitled) {
-               printk(KERN_ERR "%s: insufficient system entitlement\n",
-                      __func__);
-               panic("%s: Insufficient system entitlement", __func__);
-       }
-
-       /* Set the remaining accounting variables */
-       vio_cmo.excess.size = vio_cmo.entitled - vio_cmo.reserve.size;
-       vio_cmo.excess.free = vio_cmo.excess.size;
-       vio_cmo.min = vio_cmo.reserve.size;
-       vio_cmo.desired = vio_cmo.reserve.size;
-}
-
-/* sysfs device functions and data structures for CMO */
-
-#define viodev_cmo_rd_attr(name)                                        \
-static ssize_t viodev_cmo_##name##_show(struct device *dev,             \
-                                        struct device_attribute *attr,  \
-                                         char *buf)                     \
-{                                                                       \
-       return sprintf(buf, "%lu\n", to_vio_dev(dev)->cmo.name);        \
-}
-
-static ssize_t viodev_cmo_allocs_failed_show(struct device *dev,
-               struct device_attribute *attr, char *buf)
-{
-       struct vio_dev *viodev = to_vio_dev(dev);
-       return sprintf(buf, "%d\n", atomic_read(&viodev->cmo.allocs_failed));
-}
-
-static ssize_t viodev_cmo_allocs_failed_reset(struct device *dev,
-               struct device_attribute *attr, const char *buf, size_t count)
-{
-       struct vio_dev *viodev = to_vio_dev(dev);
-       atomic_set(&viodev->cmo.allocs_failed, 0);
-       return count;
-}
-
-static ssize_t viodev_cmo_desired_set(struct device *dev,
-               struct device_attribute *attr, const char *buf, size_t count)
-{
-       struct vio_dev *viodev = to_vio_dev(dev);
-       size_t new_desired;
-       int ret;
-
-       ret = kstrtoul(buf, 10, &new_desired);
-       if (ret)
-               return ret;
-
-       vio_cmo_set_dev_desired(viodev, new_desired);
-       return count;
-}
-
-viodev_cmo_rd_attr(desired);
-viodev_cmo_rd_attr(entitled);
-viodev_cmo_rd_attr(allocated);
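
For readability, viodev_cmo_rd_attr(entitled) above expands to roughly the following show routine (a sketch of the preprocessor output), which is then wired up as the cmo_entitled attribute in the table below:

static ssize_t viodev_cmo_entitled_show(struct device *dev,
                                        struct device_attribute *attr,
                                        char *buf)
{
        return sprintf(buf, "%lu\n", to_vio_dev(dev)->cmo.entitled);
}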
-
-static ssize_t name_show(struct device *, struct device_attribute *, char *);
-static ssize_t devspec_show(struct device *, struct device_attribute *, char *);
-static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
-                            char *buf);
-static struct device_attribute vio_cmo_dev_attrs[] = {
-       __ATTR_RO(name),
-       __ATTR_RO(devspec),
-       __ATTR_RO(modalias),
-       __ATTR(cmo_desired,       S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
-              viodev_cmo_desired_show, viodev_cmo_desired_set),
-       __ATTR(cmo_entitled,      S_IRUGO, viodev_cmo_entitled_show,      NULL),
-       __ATTR(cmo_allocated,     S_IRUGO, viodev_cmo_allocated_show,     NULL),
-       __ATTR(cmo_allocs_failed, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
-              viodev_cmo_allocs_failed_show, viodev_cmo_allocs_failed_reset),
-       __ATTR_NULL
-};
-
-/* sysfs bus functions and data structures for CMO */
-
-#define viobus_cmo_rd_attr(name)                                        \
-static ssize_t cmo_##name##_show(struct bus_type *bt, char *buf)        \
-{                                                                       \
-       return sprintf(buf, "%lu\n", vio_cmo.name);                     \
-}                                                                       \
-static BUS_ATTR_RO(cmo_##name)
-
-#define viobus_cmo_pool_rd_attr(name, var)                              \
-static ssize_t                                                          \
-cmo_##name##_##var##_show(struct bus_type *bt, char *buf)               \
-{                                                                       \
-       return sprintf(buf, "%lu\n", vio_cmo.name.var);                 \
-}                                                                       \
-static BUS_ATTR_RO(cmo_##name##_##var)
-
-viobus_cmo_rd_attr(entitled);
-viobus_cmo_rd_attr(spare);
-viobus_cmo_rd_attr(min);
-viobus_cmo_rd_attr(desired);
-viobus_cmo_rd_attr(curr);
-viobus_cmo_pool_rd_attr(reserve, size);
-viobus_cmo_pool_rd_attr(excess, size);
-viobus_cmo_pool_rd_attr(excess, free);
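
Similarly, viobus_cmo_pool_rd_attr(excess, free) expands to approximately the code below; the BUS_ATTR_RO() invocation is what defines bus_attr_cmo_excess_free, which is referenced from the vio_bus_attrs[] array further down:

static ssize_t cmo_excess_free_show(struct bus_type *bt, char *buf)
{
        return sprintf(buf, "%lu\n", vio_cmo.excess.free);
}
static BUS_ATTR_RO(cmo_excess_free);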
-
-static ssize_t cmo_high_show(struct bus_type *bt, char *buf)
-{
-       return sprintf(buf, "%lu\n", vio_cmo.high);
-}
-
-static ssize_t cmo_high_store(struct bus_type *bt, const char *buf,
-                             size_t count)
-{
-       unsigned long flags;
-
-       spin_lock_irqsave(&vio_cmo.lock, flags);
-       vio_cmo.high = vio_cmo.curr;
-       spin_unlock_irqrestore(&vio_cmo.lock, flags);
-
-       return count;
-}
-static BUS_ATTR_RW(cmo_high);
-
-static struct attribute *vio_bus_attrs[] = {
-       &bus_attr_cmo_entitled.attr,
-       &bus_attr_cmo_spare.attr,
-       &bus_attr_cmo_min.attr,
-       &bus_attr_cmo_desired.attr,
-       &bus_attr_cmo_curr.attr,
-       &bus_attr_cmo_high.attr,
-       &bus_attr_cmo_reserve_size.attr,
-       &bus_attr_cmo_excess_size.attr,
-       &bus_attr_cmo_excess_free.attr,
-       NULL,
-};
-ATTRIBUTE_GROUPS(vio_bus);
-
-static void vio_cmo_sysfs_init(void)
-{
-       vio_bus_type.dev_attrs = vio_cmo_dev_attrs;
-       vio_bus_type.bus_groups = vio_bus_groups;
-}
-#else /* CONFIG_PPC_SMLPAR */
-int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; }
-void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {}
-static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; }
-static void vio_cmo_bus_remove(struct vio_dev *viodev) {}
-static void vio_cmo_set_dma_ops(struct vio_dev *viodev) {}
-static void vio_cmo_bus_init(void) {}
-static void vio_cmo_sysfs_init(void) { }
-#endif /* CONFIG_PPC_SMLPAR */
-EXPORT_SYMBOL(vio_cmo_entitlement_update);
-EXPORT_SYMBOL(vio_cmo_set_dev_desired);
-
-
-/*
- * Platform Facilities Option (PFO) support
- */
-
-/**
- * vio_h_cop_sync - Perform a synchronous PFO co-processor operation
- *
- * @vdev - Pointer to a struct vio_dev for device
- * @op - Pointer to a struct vio_pfo_op for the operation parameters
- *
- * Calls the hypervisor to synchronously perform the PFO operation
- * described in @op.  In the case of a busy response from the hypervisor,
- * the operation will be re-submitted indefinitely unless a non-zero timeout
- * is specified or an error occurs. The timeout places a limit on when to
- * stop re-submitting an operation; the total time can be exceeded if an
- * operation is in progress.
- *
- * On return, op->hcall_err holds the return code from the last H_COP hcall,
- * or 0 if an error not involving the hcall was encountered.
- *
- * Returns:
- *     0 on success,
- *     -EINVAL if the h_call fails due to an invalid parameter,
- *     -E2BIG if the h_call can not be performed synchronously,
- *     -EBUSY if a timeout is specified and has elapsed,
- *     -EACCES if the memory area for data/status has been rescinded, or
- *     -EPERM if a hardware fault has been indicated
- */
-int vio_h_cop_sync(struct vio_dev *vdev, struct vio_pfo_op *op)
-{
-       struct device *dev = &vdev->dev;
-       unsigned long deadline = 0;
-       long hret = 0;
-       int ret = 0;
-
-       if (op->timeout)
-               deadline = jiffies + msecs_to_jiffies(op->timeout);
-
-       while (true) {
-               hret = plpar_hcall_norets(H_COP, op->flags,
-                               vdev->resource_id,
-                               op->in, op->inlen, op->out,
-                               op->outlen, op->csbcpb);
-
-               if (hret == H_SUCCESS ||
-                   (hret != H_NOT_ENOUGH_RESOURCES &&
-                    hret != H_BUSY && hret != H_RESOURCE) ||
-                   (op->timeout && time_after(deadline, jiffies)))
-                       break;
-
-               dev_dbg(dev, "%s: hcall ret(%ld), retrying.\n", __func__, hret);
-       }
-
-       switch (hret) {
-       case H_SUCCESS:
-               ret = 0;
-               break;
-       case H_OP_MODE:
-       case H_TOO_BIG:
-               ret = -E2BIG;
-               break;
-       case H_RESCINDED:
-               ret = -EACCES;
-               break;
-       case H_HARDWARE:
-               ret = -EPERM;
-               break;
-       case H_NOT_ENOUGH_RESOURCES:
-       case H_RESOURCE:
-       case H_BUSY:
-               ret = -EBUSY;
-               break;
-       default:
-               ret = -EINVAL;
-               break;
-       }
-
-       if (ret)
-               dev_dbg(dev, "%s: Sync h_cop_op failure (ret:%d) (hret:%ld)\n",
-                               __func__, ret, hret);
-
-       op->hcall_err = hret;
-       return ret;
-}
-EXPORT_SYMBOL(vio_h_cop_sync);
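
A minimal sketch of a caller, assuming the input, output and CSB/CPB buffers have already been DMA-mapped; only the vio_pfo_op fields actually used by vio_h_cop_sync() are shown, and every name and value here is illustrative rather than taken from a real driver:

#include <asm/vio.h>

static int hypothetical_submit_op(struct vio_dev *vdev,
                                  u64 in_dma, u64 in_len,
                                  u64 out_dma, u64 out_len,
                                  u64 csbcpb_dma)
{
        struct vio_pfo_op op = {
                .flags   = 0,            /* operation-specific H_COP flags */
                .in      = in_dma,       /* DMA address of input buffer */
                .inlen   = in_len,
                .out     = out_dma,      /* DMA address of output buffer */
                .outlen  = out_len,
                .csbcpb  = csbcpb_dma,   /* DMA address of the CSB/CPB block */
                .timeout = 100,          /* stop retrying busy responses after ~100ms */
        };
        int rc;

        rc = vio_h_cop_sync(vdev, &op);
        if (rc)
                dev_dbg(&vdev->dev, "H_COP failed: rc=%d hret=%ld\n",
                        rc, op.hcall_err);
        return rc;
}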
-
-static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
-{
-       const __be32 *dma_window;
-       struct iommu_table *tbl;
-       unsigned long offset, size;
-
-       dma_window = of_get_property(dev->dev.of_node,
-                                 "ibm,my-dma-window", NULL);
-       if (!dma_window)
-               return NULL;
-
-       tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
-       if (tbl == NULL)
-               return NULL;
-
-       of_parse_dma_window(dev->dev.of_node, dma_window,
-                           &tbl->it_index, &offset, &size);
-
-       /* TCE table size - measured in tce entries */
-       tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
-       tbl->it_size = size >> tbl->it_page_shift;
-       /* offset for VIO should always be 0 */
-       tbl->it_offset = offset >> tbl->it_page_shift;
-       tbl->it_busno = 0;
-       tbl->it_type = TCE_VB;
-       tbl->it_blocksize = 16;
-
-       if (firmware_has_feature(FW_FEATURE_LPAR))
-               tbl->it_ops = &iommu_table_lpar_multi_ops;
-       else
-               tbl->it_ops = &iommu_table_pseries_ops;
-
-       return iommu_init_table(tbl, -1);
-}
-
-/**
- * vio_match_device: - Tell if a VIO device has a matching
- *                     VIO device id structure.
- * @ids:       array of VIO device id structures to search in
- * @dev:       the VIO device structure to match against
- *
- * Used by a driver to check whether a VIO device present in the
- * system is in its list of supported devices. Returns the matching
- * vio_device_id structure or NULL if there is no match.
- */
-static const struct vio_device_id *vio_match_device(
-               const struct vio_device_id *ids, const struct vio_dev *dev)
-{
-       while (ids->type[0] != '\0') {
-               if ((strncmp(dev->type, ids->type, strlen(ids->type)) == 0) &&
-                   of_device_is_compatible(dev->dev.of_node,
-                                        ids->compat))
-                       return ids;
-               ids++;
-       }
-       return NULL;
-}
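
Matching is a prefix comparison on the device_type plus an OF "compatible" check, so a driver's table looks like the sketch below. The names are illustrative; the type/compat pair follows the style used by existing VIO drivers such as the virtual ethernet driver, and the empty entry terminates the table as required by the ids->type[0] test above:

#include <linux/mod_devicetable.h>
#include <linux/module.h>

static const struct vio_device_id hypothetical_vio_ids[] = {
        { "network", "IBM,l-lan" },     /* device_type prefix, OF compatible */
        { "", "" }                      /* terminating entry */
};
MODULE_DEVICE_TABLE(vio, hypothetical_vio_ids);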
-
-/*
- * Convert from struct device to struct vio_dev and pass to driver.
- * dev->driver has already been set by generic code because vio_bus_match
- * succeeded.
- */
-static int vio_bus_probe(struct device *dev)
-{
-       struct vio_dev *viodev = to_vio_dev(dev);
-       struct vio_driver *viodrv = to_vio_driver(dev->driver);
-       const struct vio_device_id *id;
-       int error = -ENODEV;
-
-       if (!viodrv->probe)
-               return error;
-
-       id = vio_match_device(viodrv->id_table, viodev);
-       if (id) {
-               memset(&viodev->cmo, 0, sizeof(viodev->cmo));
-               if (firmware_has_feature(FW_FEATURE_CMO)) {
-                       error = vio_cmo_bus_probe(viodev);
-                       if (error)
-                               return error;
-               }
-               error = viodrv->probe(viodev, id);
-               if (error && firmware_has_feature(FW_FEATURE_CMO))
-                       vio_cmo_bus_remove(viodev);
-       }
-
-       return error;
-}
-
-/* convert from struct device to struct vio_dev and pass to driver. */
-static int vio_bus_remove(struct device *dev)
-{
-       struct vio_dev *viodev = to_vio_dev(dev);
-       struct vio_driver *viodrv = to_vio_driver(dev->driver);
-       struct device *devptr;
-       int ret = 1;
-
-       /*
-        * Hold a reference to the device after the remove function is called
-        * to allow for CMO accounting cleanup for the device.
-        */
-       devptr = get_device(dev);
-
-       if (viodrv->remove)
-               ret = viodrv->remove(viodev);
-
-       if (!ret && firmware_has_feature(FW_FEATURE_CMO))
-               vio_cmo_bus_remove(viodev);
-
-       put_device(devptr);
-       return ret;
-}
-
-/**
- * vio_register_driver: - Register a new vio driver
- * @viodrv:    The vio_driver structure to be registered.
- */
-int __vio_register_driver(struct vio_driver *viodrv, struct module *owner,
-                         const char *mod_name)
-{
-       pr_debug("%s: driver %s registering\n", __func__, viodrv->name);
-
-       /* fill in 'struct driver' fields */
-       viodrv->driver.name = viodrv->name;
-       viodrv->driver.pm = viodrv->pm;
-       viodrv->driver.bus = &vio_bus_type;
-       viodrv->driver.owner = owner;
-       viodrv->driver.mod_name = mod_name;
-
-       return driver_register(&viodrv->driver);
-}
-EXPORT_SYMBOL(__vio_register_driver);
-
-/**
- * vio_unregister_driver - Remove registration of vio driver.
- * @viodrv:    The vio_driver struct to be removed form registration
- */
-void vio_unregister_driver(struct vio_driver *viodrv)
-{
-       driver_unregister(&viodrv->driver);
-}
-EXPORT_SYMBOL(vio_unregister_driver);
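
Putting the registration pieces together, a skeletal CMO-aware driver would look roughly like the sketch below. All names are hypothetical; vio_register_driver() is the usual wrapper that passes THIS_MODULE and KBUILD_MODNAME to __vio_register_driver() above, and get_desired_dma() is the hook consumed by vio_cmo_bus_probe():

static unsigned long hypothetical_get_desired_dma(struct vio_dev *vdev)
{
        /* Entitlement hint used by vio_cmo_bus_probe(); it is rounded up
         * to the IOMMU page size and floored at VIO_CMO_MIN_ENT. */
        return 64 * 1024;
}

static int hypothetical_probe(struct vio_dev *vdev,
                              const struct vio_device_id *id)
{
        dev_info(&vdev->dev, "bound to %s\n", vdev->name);
        return 0;
}

static int hypothetical_remove(struct vio_dev *vdev)
{
        return 0;
}

static struct vio_driver hypothetical_driver = {
        .name            = "hypothetical-vio",
        .id_table        = hypothetical_vio_ids,
        .probe           = hypothetical_probe,
        .remove          = hypothetical_remove,
        .get_desired_dma = hypothetical_get_desired_dma,
};

static int __init hypothetical_init(void)
{
        return vio_register_driver(&hypothetical_driver);
}
module_init(hypothetical_init);

static void __exit hypothetical_exit(void)
{
        vio_unregister_driver(&hypothetical_driver);
}
module_exit(hypothetical_exit);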
-
-/* vio_dev refcount hit 0 */
-static void vio_dev_release(struct device *dev)
-{
-       struct iommu_table *tbl = get_iommu_table_base(dev);
-
-       if (tbl)
-               iommu_free_table(tbl, of_node_full_name(dev->of_node));
-       of_node_put(dev->of_node);
-       kfree(to_vio_dev(dev));
-}
-
-/**
- * vio_register_device_node: - Register a new vio device.
- * @of_node:   The OF node for this device.
- *
- * Creates and initializes a vio_dev structure from the data in
- * of_node and adds it to the list of virtual devices.
- * Returns a pointer to the created vio_dev or NULL if node has
- * NULL device_type or compatible fields.
- */
-struct vio_dev *vio_register_device_node(struct device_node *of_node)
-{
-       struct vio_dev *viodev;
-       struct device_node *parent_node;
-       const __be32 *prop;
-       enum vio_dev_family family;
-       const char *of_node_name = of_node->name ? of_node->name : "<unknown>";
-
-       /*
-        * Determine if this node is under the /vdevice node or under the
-        * /ibm,platform-facilities node.  This decides the device's family.
-        */
-       parent_node = of_get_parent(of_node);
-       if (parent_node) {
-               if (!strcmp(parent_node->full_name, "/ibm,platform-facilities"))
-                       family = PFO;
-               else if (!strcmp(parent_node->full_name, "/vdevice"))
-                       family = VDEVICE;
-               else {
-                       pr_warn("%s: parent(%s) of %s not recognized.\n",
-                                       __func__,
-                                       parent_node->full_name,
-                                       of_node_name);
-                       of_node_put(parent_node);
-                       return NULL;
-               }
-               of_node_put(parent_node);
-       } else {
-               pr_warn("%s: could not determine the parent of node %s.\n",
-                               __func__, of_node_name);
-               return NULL;
-       }
-
-       if (family == PFO) {
-               if (of_get_property(of_node, "interrupt-controller", NULL)) {
-                       pr_debug("%s: Skipping the interrupt controller %s.\n",
-                                       __func__, of_node_name);
-                       return NULL;
-               }
-       }
-
-       /* allocate a vio_dev for this node */
-       viodev = kzalloc(sizeof(struct vio_dev), GFP_KERNEL);
-       if (viodev == NULL) {
-               pr_warn("%s: allocation failure for VIO device.\n", __func__);
-               return NULL;
-       }
-
-       /* we need the 'device_type' property, in order to match with drivers */
-       viodev->family = family;
-       if (viodev->family == VDEVICE) {
-               unsigned int unit_address;
-
-               if (of_node->type != NULL)
-                       viodev->type = of_node->type;
-               else {
-                       pr_warn("%s: node %s is missing the 'device_type' "
-                                       "property.\n", __func__, of_node_name);
-                       goto out;
-               }
-
-               prop = of_get_property(of_node, "reg", NULL);
-               if (prop == NULL) {
-                       pr_warn("%s: node %s missing 'reg'\n",
-                                       __func__, of_node_name);
-                       goto out;
-               }
-               unit_address = of_read_number(prop, 1);
-               dev_set_name(&viodev->dev, "%x", unit_address);
-               viodev->irq = irq_of_parse_and_map(of_node, 0);
-               viodev->unit_address = unit_address;
-       } else {
-               /* PFO devices need their resource_id for submitting COP_OPs
-                * This is an optional field for devices, but is required when
-                * performing synchronous ops */
-               prop = of_get_property(of_node, "ibm,resource-id", NULL);
-               if (prop != NULL)
-                       viodev->resource_id = of_read_number(prop, 1);
-
-               dev_set_name(&viodev->dev, "%s", of_node_name);
-               viodev->type = of_node_name;
-               viodev->irq = 0;
-       }
-
-       viodev->name = of_node->name;
-       viodev->dev.of_node = of_node_get(of_node);
-
-       set_dev_node(&viodev->dev, of_node_to_nid(of_node));
-
-       /* init generic 'struct device' fields: */
-       viodev->dev.parent = &vio_bus_device.dev;
-       viodev->dev.bus = &vio_bus_type;
-       viodev->dev.release = vio_dev_release;
-
-       if (of_get_property(viodev->dev.of_node, "ibm,my-dma-window", NULL)) {
-               if (firmware_has_feature(FW_FEATURE_CMO))
-                       vio_cmo_set_dma_ops(viodev);
-               else
-                       set_dma_ops(&viodev->dev, &dma_iommu_ops);
-
-               set_iommu_table_base(&viodev->dev,
-                                    vio_build_iommu_table(viodev));
-
-               /* needed to ensure proper operation of coherent allocations
-                * later, in case driver doesn't set it explicitly */
-               viodev->dev.coherent_dma_mask = DMA_BIT_MASK(64);
-               viodev->dev.dma_mask = &viodev->dev.coherent_dma_mask;
-       }
-
-       /* register with generic device framework */
-       if (device_register(&viodev->dev)) {
-               printk(KERN_ERR "%s: failed to register device %s\n",
-                               __func__, dev_name(&viodev->dev));
-               put_device(&viodev->dev);
-               return NULL;
-       }
-
-       return viodev;
-
-out:   /* Use this exit point for any return prior to device_register */
-       kfree(viodev);
-
-       return NULL;
-}
-EXPORT_SYMBOL(vio_register_device_node);
-
-/*
- * vio_bus_scan_register_devices - Scan OF and register each child device
- * @root_name - OF node name for the root of the subtree to search.
- *             This must be non-NULL
- *
- * Starting from the root node provided, register the device node for
- * each child beneath the root.
- */
-static void vio_bus_scan_register_devices(char *root_name)
-{
-       struct device_node *node_root, *node_child;
-
-       if (!root_name)
-               return;
-
-       node_root = of_find_node_by_name(NULL, root_name);
-       if (node_root) {
-
-               /*
-                * Create struct vio_devices for each virtual device in
-                * the device tree. Drivers will associate with them later.
-                */
-               node_child = of_get_next_child(node_root, NULL);
-               while (node_child) {
-                       vio_register_device_node(node_child);
-                       node_child = of_get_next_child(node_root, node_child);
-               }
-               of_node_put(node_root);
-       }
-}
-
-/**
- * vio_bus_init: - Initialize the virtual IO bus
- */
-static int __init vio_bus_init(void)
-{
-       int err;
-
-       if (firmware_has_feature(FW_FEATURE_CMO))
-               vio_cmo_sysfs_init();
-
-       err = bus_register(&vio_bus_type);
-       if (err) {
-               printk(KERN_ERR "failed to register VIO bus\n");
-               return err;
-       }
-
-       /*
-        * The fake parent of all vio devices, just to give us
-        * a nice directory
-        */
-       err = device_register(&vio_bus_device.dev);
-       if (err) {
-               printk(KERN_WARNING "%s: device_register returned %i\n",
-                               __func__, err);
-               return err;
-       }
-
-       if (firmware_has_feature(FW_FEATURE_CMO))
-               vio_cmo_bus_init();
-
-       return 0;
-}
-postcore_initcall(vio_bus_init);
-
-static int __init vio_device_init(void)
-{
-       vio_bus_scan_register_devices("vdevice");
-       vio_bus_scan_register_devices("ibm,platform-facilities");
-
-       return 0;
-}
-device_initcall(vio_device_init);
-
-static ssize_t name_show(struct device *dev,
-               struct device_attribute *attr, char *buf)
-{
-       return sprintf(buf, "%s\n", to_vio_dev(dev)->name);
-}
-
-static ssize_t devspec_show(struct device *dev,
-               struct device_attribute *attr, char *buf)
-{
-       struct device_node *of_node = dev->of_node;
-
-       return sprintf(buf, "%s\n", of_node_full_name(of_node));
-}
-
-static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
-                            char *buf)
-{
-       const struct vio_dev *vio_dev = to_vio_dev(dev);
-       struct device_node *dn;
-       const char *cp;
-
-       dn = dev->of_node;
-       if (!dn) {
-               strcpy(buf, "\n");
-               return strlen(buf);
-       }
-       cp = of_get_property(dn, "compatible", NULL);
-       if (!cp) {
-               strcpy(buf, "\n");
-               return strlen(buf);
-       }
-
-       return sprintf(buf, "vio:T%sS%s\n", vio_dev->type, cp);
-}
-
-static struct device_attribute vio_dev_attrs[] = {
-       __ATTR_RO(name),
-       __ATTR_RO(devspec),
-       __ATTR_RO(modalias),
-       __ATTR_NULL
-};
-
-void vio_unregister_device(struct vio_dev *viodev)
-{
-       device_unregister(&viodev->dev);
-}
-EXPORT_SYMBOL(vio_unregister_device);
-
-static int vio_bus_match(struct device *dev, struct device_driver *drv)
-{
-       const struct vio_dev *vio_dev = to_vio_dev(dev);
-       struct vio_driver *vio_drv = to_vio_driver(drv);
-       const struct vio_device_id *ids = vio_drv->id_table;
-
-       return (ids != NULL) && (vio_match_device(ids, vio_dev) != NULL);
-}
-
-static int vio_hotplug(struct device *dev, struct kobj_uevent_env *env)
-{
-       const struct vio_dev *vio_dev = to_vio_dev(dev);
-       struct device_node *dn;
-       const char *cp;
-
-       dn = dev->of_node;
-       if (!dn)
-               return -ENODEV;
-       cp = of_get_property(dn, "compatible", NULL);
-       if (!cp)
-               return -ENODEV;
-
-       add_uevent_var(env, "MODALIAS=vio:T%sS%s", vio_dev->type, cp);
-       return 0;
-}
-
-struct bus_type vio_bus_type = {
-       .name = "vio",
-       .dev_attrs = vio_dev_attrs,
-       .uevent = vio_hotplug,
-       .match = vio_bus_match,
-       .probe = vio_bus_probe,
-       .remove = vio_bus_remove,
-};
-
-/**
- * vio_get_attribute: - get attribute for virtual device
- * @vdev:      The vio device to get property.
- * @which:     The property/attribute to be extracted.
- * @length:    Pointer to length of returned data size (unused if NULL).
- *
- * Calls prom.c's of_get_property() to return the value of the
- * attribute specified by @which
-*/
-const void *vio_get_attribute(struct vio_dev *vdev, char *which, int *length)
-{
-       return of_get_property(vdev->dev.of_node, which, length);
-}
-EXPORT_SYMBOL(vio_get_attribute);
-
-#ifdef CONFIG_PPC_PSERIES
-/* vio_find_name() - internal because only vio.c knows how we formatted the
- * kobject name
- */
-static struct vio_dev *vio_find_name(const char *name)
-{
-       struct device *found;
-
-       found = bus_find_device_by_name(&vio_bus_type, NULL, name);
-       if (!found)
-               return NULL;
-
-       return to_vio_dev(found);
-}
-
-/**
- * vio_find_node - find an already-registered vio_dev
- * @vnode: device_node of the virtual device we're looking for
- *
- * Takes a reference to the embedded struct device which needs to be dropped
- * after use.
- */
-struct vio_dev *vio_find_node(struct device_node *vnode)
-{
-       char kobj_name[20];
-       struct device_node *vnode_parent;
-       const char *dev_type;
-
-       vnode_parent = of_get_parent(vnode);
-       if (!vnode_parent)
-               return NULL;
-
-       dev_type = of_get_property(vnode_parent, "device_type", NULL);
-       of_node_put(vnode_parent);
-       if (!dev_type)
-               return NULL;
-
-       /* construct the kobject name from the device node */
-       if (!strcmp(dev_type, "vdevice")) {
-               const __be32 *prop;
-               
-               prop = of_get_property(vnode, "reg", NULL);
-               if (!prop)
-                       return NULL;
-               snprintf(kobj_name, sizeof(kobj_name), "%x",
-                        (uint32_t)of_read_number(prop, 1));
-       } else if (!strcmp(dev_type, "ibm,platform-facilities"))
-               snprintf(kobj_name, sizeof(kobj_name), "%s", vnode->name);
-       else
-               return NULL;
-
-       return vio_find_name(kobj_name);
-}
-EXPORT_SYMBOL(vio_find_node);
-
-int vio_enable_interrupts(struct vio_dev *dev)
-{
-       int rc = h_vio_signal(dev->unit_address, VIO_IRQ_ENABLE);
-       if (rc != H_SUCCESS)
-               printk(KERN_ERR "vio: Error 0x%x enabling interrupts\n", rc);
-       return rc;
-}
-EXPORT_SYMBOL(vio_enable_interrupts);
-
-int vio_disable_interrupts(struct vio_dev *dev)
-{
-       int rc = h_vio_signal(dev->unit_address, VIO_IRQ_DISABLE);
-       if (rc != H_SUCCESS)
-               printk(KERN_ERR "vio: Error 0x%x disabling interrupts\n", rc);
-       return rc;
-}
-EXPORT_SYMBOL(vio_disable_interrupts);
-#endif /* CONFIG_PPC_PSERIES */
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index b64287c6793f15be5d9987ae1544181c20453923..9c78a9c102c3a9fd9bc6f89fe0d326284e9c4ab1 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -15,6 +15,7 @@
 #include <asm/sstep.h>
 #include <asm/processor.h>
 #include <asm/uaccess.h>
+#include <asm/cpu_has_feature.h>
 #include <asm/cputable.h>
 
 extern char system_call_common[];
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 3f4d338985fcc60224bc0f2e35860f924abc83a2..7414034df1c364682ee8736d72c09927bea7ff24 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -43,5 +43,5 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
 obj-$(CONFIG_HIGHMEM)          += highmem.o
 obj-$(CONFIG_PPC_COPRO_BASE)   += copro_fault.o
 obj-$(CONFIG_SPAPR_TCE_IOMMU)  += mmu_context_iommu.o
-obj-$(CONFIG_PPC_PTDUMP)       += dump_linuxpagetables.o \
-                                  dump_hashpagetable.o
+obj-$(CONFIG_PPC_PTDUMP)       += dump_linuxpagetables.o
+obj-$(CONFIG_PPC_HTDUMP)       += dump_hashpagetable.o
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index 362954f98029b46d4d3d312b239bb7a2fa8fe63a..aaa7ec6788b9ee5b69da6a1004808a8d60f8ecc2 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -134,6 +134,9 @@ int copro_calculate_slb(struct mm_struct *mm, u64 ea, struct copro_slb *slb)
                pr_debug("%s: invalid region access at %016llx\n", __func__, ea);
                return 1;
        }
+       /* Bad address */
+       if (!vsid)
+               return 1;
 
        vsid = (vsid << slb_vsid_shift(ssize)) | vsidkey;
 
diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index d242bc79ae642c7dd9435a5f9b1c91b24826bb5a..49abaf4dc8e3a602ba6ea565cf940fbfddfb770a 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -159,6 +159,7 @@ static const struct flag_info flag_array[] = {
                .set    = "no cache",
                .clear  = "        ",
        }, {
+#ifdef CONFIG_PPC_BOOK3S_64
                .mask   = H_PAGE_BUSY,
                .val    = H_PAGE_BUSY,
                .set    = "busy",
@@ -183,6 +184,7 @@ static const struct flag_info flag_array[] = {
                .val    = H_PAGE_F_SECOND,
                .set    = "f_second",
        }, {
+#endif
                .mask   = _PAGE_SPECIAL,
                .val    = _PAGE_SPECIAL,
                .set    = "special",
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 73932f4a386e75082e622cd60049779c2d33150a..6fd30ac7d14a0d2761e82d479fe01724bd53a38d 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -390,6 +390,20 @@ good_area:
 #endif /* CONFIG_8xx */
 
        if (is_exec) {
+               /*
+                * An execution fault + no execute ?
+                *
+                * On CPUs that don't have CPU_FTR_COHERENT_ICACHE we
+                * deliberately create NX mappings, and use the fault to do the
+                * cache flush. This is usually handled in hash_page_do_lazy_icache()
+                * but we could end up here if that races with a concurrent PTE
+                * update. In that case we need to fall through here to the VMA
+                * check below.
+                */
+               if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE) &&
+                       (regs->msr & SRR1_ISI_N_OR_G))
+                       goto bad_area;
+
                /*
                 * Allow execution from readable areas if the MMU does not
                 * provide separate controls over reading and executing.
@@ -404,6 +418,7 @@ good_area:
                    (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
                     !(vma->vm_flags & (VM_READ | VM_WRITE))))
                        goto bad_area;
+
 #ifdef CONFIG_PPC_STD_MMU
                /*
                 * protfault should only happen due to us
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 9d9b3eff123e64085645d89916e59c7fb90239b0..cc332608e65664f40f337cd4418c70a164b6259d 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -223,13 +223,18 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn,
                return -1;
 
        hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
-       hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags;
+       hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
 
        if (!(vflags & HPTE_V_BOLTED)) {
                DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
                        i, hpte_v, hpte_r);
        }
 
+       if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+               hpte_r = hpte_old_to_new_r(hpte_v, hpte_r);
+               hpte_v = hpte_old_to_new_v(hpte_v);
+       }
+
        hptep->r = cpu_to_be64(hpte_r);
        /* Guarantee the second dword is visible before the valid bit */
        eieio();
@@ -297,6 +302,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
                vpn, want_v & HPTE_V_AVPN, slot, newpp);
 
        hpte_v = be64_to_cpu(hptep->v);
+       if (cpu_has_feature(CPU_FTR_ARCH_300))
+               hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
        /*
         * We need to invalidate the TLB always because hpte_remove doesn't do
         * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
@@ -311,6 +318,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp,
                native_lock_hpte(hptep);
                /* recheck with locks held */
                hpte_v = be64_to_cpu(hptep->v);
+               if (cpu_has_feature(CPU_FTR_ARCH_300))
+                       hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
                if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) ||
                             !(hpte_v & HPTE_V_VALID))) {
                        ret = -1;
@@ -352,6 +361,8 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize)
        for (i = 0; i < HPTES_PER_GROUP; i++) {
                hptep = htab_address + slot;
                hpte_v = be64_to_cpu(hptep->v);
+               if (cpu_has_feature(CPU_FTR_ARCH_300))
+                       hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
 
                if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
                        /* HPTE matches */
@@ -411,6 +422,8 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
        want_v = hpte_encode_avpn(vpn, bpsize, ssize);
        native_lock_hpte(hptep);
        hpte_v = be64_to_cpu(hptep->v);
+       if (cpu_has_feature(CPU_FTR_ARCH_300))
+               hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
 
        /*
         * We need to invalidate the TLB always because hpte_remove doesn't do
@@ -469,6 +482,8 @@ static void native_hugepage_invalidate(unsigned long vsid,
                want_v = hpte_encode_avpn(vpn, psize, ssize);
                native_lock_hpte(hptep);
                hpte_v = be64_to_cpu(hptep->v);
+               if (cpu_has_feature(CPU_FTR_ARCH_300))
+                       hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));
 
                /* Even if we miss, we need to invalidate the TLB */
                if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
@@ -506,6 +521,10 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
        /* Look at the 8 bit LP value */
        unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
 
+       if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+               hpte_v = hpte_new_to_old_v(hpte_v, hpte_r);
+               hpte_r = hpte_new_to_old_r(hpte_r);
+       }
        if (!(hpte_v & HPTE_V_LARGE)) {
                size   = MMU_PAGE_4K;
                a_size = MMU_PAGE_4K;
@@ -514,11 +533,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot,
                a_size = hpte_page_sizes[lp] >> 4;
        }
        /* This works for all page sizes, and for 256M and 1T segments */
-       if (cpu_has_feature(CPU_FTR_ARCH_300))
-               *ssize = hpte_r >> HPTE_R_3_0_SSIZE_SHIFT;
-       else
-               *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
-
+       *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT;
        shift = mmu_psize_defs[size].shift;
 
        avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm);
@@ -641,6 +656,9 @@ static void native_flush_hash_range(unsigned long number, int local)
                        want_v = hpte_encode_avpn(vpn, psize, ssize);
                        native_lock_hpte(hptep);
                        hpte_v = be64_to_cpu(hptep->v);
+                       if (cpu_has_feature(CPU_FTR_ARCH_300))
+                               hpte_v = hpte_new_to_old_v(hpte_v,
+                                               be64_to_cpu(hptep->r));
                        if (!HPTE_V_COMPARE(hpte_v, want_v) ||
                            !(hpte_v & HPTE_V_VALID))
                                native_unlock_hpte(hptep);
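
The ISA v3.0 (POWER9) HPTE layout rearranges fields between the two doublewords (the segment size, for instance, now lives in the second dword), so every lookup above converts the entry back to the legacy layout before comparing it against want_v. A minimal sketch of that read-side pattern, assuming the hpte_new_to_old_v()/hpte_new_to_old_r() helpers added by this series; the wrapper function itself is hypothetical:

/*
 * Sketch only: the read-side conversion pattern used throughout
 * hash_native_64.c on ISA v3.0 CPUs. The helper names come from this
 * series; the wrapper is illustrative, not part of the patch.
 */
static unsigned long hpte_get_old_v_sketch(struct hash_pte *hptep)
{
        unsigned long hpte_v = be64_to_cpu(hptep->v);

        if (cpu_has_feature(CPU_FTR_ARCH_300))
                hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r));

        return hpte_v;  /* now in the layout the HPTE_V_* masks expect */
}
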
index 44d3c3a38e3ecc71c779b4a09f07e2c12261a99b..b9a062f5805b3cb428d0de5dd03e3604d1e2aa70 100644 (file)
@@ -792,37 +792,17 @@ static void update_hid_for_hash(void)
 static void __init hash_init_partition_table(phys_addr_t hash_table,
                                             unsigned long htab_size)
 {
-       unsigned long ps_field;
-       unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
+       mmu_partition_table_init();
 
        /*
-        * slb llp encoding for the page size used in VPM real mode.
-        * We can ignore that for lpid 0
+        * PS field (VRMA page size) is not used for LPID 0, hence set to 0.
+        * For now, UPRT is 0 and we have no segment table.
         */
-       ps_field = 0;
        htab_size =  __ilog2(htab_size) - 18;
-
-       BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large.");
-       partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
-                                               MEMBLOCK_ALLOC_ANYWHERE));
-
-       /* Initialize the Partition Table with no entries */
-       memset((void *)partition_tb, 0, patb_size);
-       partition_tb->patb0 = cpu_to_be64(ps_field | hash_table | htab_size);
-       /*
-        * FIXME!! This should be done via update_partition table
-        * For now UPRT is 0 for us.
-        */
-       partition_tb->patb1 = 0;
+       mmu_partition_table_set_entry(0, hash_table | htab_size, 0);
        pr_info("Partition table %p\n", partition_tb);
        if (cpu_has_feature(CPU_FTR_POWER9_DD1))
                update_hid_for_hash();
-       /*
-        * update partition table control register,
-        * 64 K size.
-        */
-       mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
-
 }
 
 static void __init htab_initialize(void)
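
The "__ilog2(htab_size) - 18" line kept above is the architected HTABSIZE encoding: the hash table occupies 2^(18 + HTABSIZE) bytes, so the field is the log2 of the byte size minus 18. As a worked example, a 256 MB hash table gives __ilog2(0x10000000) - 18 = 28 - 18 = 10, and that value is OR'd into partition-table dword 0 together with the table's physical address by mmu_partition_table_set_entry().
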
index b114f8b93ec92145e5a541dbcc8c3c7708cf590e..73bf6e14c3aa04c082921c3691c2cd48320c2892 100644 (file)
@@ -115,7 +115,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
        mm->context.pte_frag = NULL;
 #endif
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-       mm_iommu_init(&mm->context);
+       mm_iommu_init(mm);
 #endif
        return 0;
 }
@@ -156,13 +156,11 @@ static inline void destroy_pagetable_page(struct mm_struct *mm)
 }
 #endif
 
-
 void destroy_context(struct mm_struct *mm)
 {
 #ifdef CONFIG_SPAPR_TCE_IOMMU
-       mm_iommu_cleanup(&mm->context);
+       WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
 #endif
-
 #ifdef CONFIG_PPC_ICSWX
        drop_cop(mm->context.acop, mm);
        kfree(mm->context.cop_lockp);
index e0f1c33601ddb1ab6bc3e780b1ed43f9f80a1e58..104bad029ce9b98afff249f5441515c6b4d625f5 100644 (file)
@@ -56,7 +56,7 @@ static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
        }
 
        pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
-                       current->pid,
+                       current ? current->pid : 0,
                        incr ? '+' : '-',
                        npages << PAGE_SHIFT,
                        mm->locked_vm << PAGE_SHIFT,
@@ -66,12 +66,9 @@ static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
        return ret;
 }
 
-bool mm_iommu_preregistered(void)
+bool mm_iommu_preregistered(struct mm_struct *mm)
 {
-       if (!current || !current->mm)
-               return false;
-
-       return !list_empty(&current->mm->context.iommu_group_mem_list);
+       return !list_empty(&mm->context.iommu_group_mem_list);
 }
 EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
 
@@ -124,19 +121,16 @@ static int mm_iommu_move_page_from_cma(struct page *page)
        return 0;
 }
 
-long mm_iommu_get(unsigned long ua, unsigned long entries,
+long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
                struct mm_iommu_table_group_mem_t **pmem)
 {
        struct mm_iommu_table_group_mem_t *mem;
        long i, j, ret = 0, locked_entries = 0;
        struct page *page = NULL;
 
-       if (!current || !current->mm)
-               return -ESRCH; /* process exited */
-
        mutex_lock(&mem_list_mutex);
 
-       list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list,
+       list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list,
                        next) {
                if ((mem->ua == ua) && (mem->entries == entries)) {
                        ++mem->used;
@@ -154,7 +148,7 @@ long mm_iommu_get(unsigned long ua, unsigned long entries,
 
        }
 
-       ret = mm_iommu_adjust_locked_vm(current->mm, entries, true);
+       ret = mm_iommu_adjust_locked_vm(mm, entries, true);
        if (ret)
                goto unlock_exit;
 
@@ -215,11 +209,11 @@ populate:
        mem->entries = entries;
        *pmem = mem;
 
-       list_add_rcu(&mem->next, &current->mm->context.iommu_group_mem_list);
+       list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list);
 
 unlock_exit:
        if (locked_entries && ret)
-               mm_iommu_adjust_locked_vm(current->mm, locked_entries, false);
+               mm_iommu_adjust_locked_vm(mm, locked_entries, false);
 
        mutex_unlock(&mem_list_mutex);
 
@@ -264,17 +258,13 @@ static void mm_iommu_free(struct rcu_head *head)
 static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem)
 {
        list_del_rcu(&mem->next);
-       mm_iommu_adjust_locked_vm(current->mm, mem->entries, false);
        call_rcu(&mem->rcu, mm_iommu_free);
 }
 
-long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem)
+long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
 {
        long ret = 0;
 
-       if (!current || !current->mm)
-               return -ESRCH; /* process exited */
-
        mutex_lock(&mem_list_mutex);
 
        if (mem->used == 0) {
@@ -297,6 +287,8 @@ long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem)
        /* @mapped became 0 so now mappings are disabled, release the region */
        mm_iommu_release(mem);
 
+       mm_iommu_adjust_locked_vm(mm, mem->entries, false);
+
 unlock_exit:
        mutex_unlock(&mem_list_mutex);
 
@@ -304,14 +296,12 @@ unlock_exit:
 }
 EXPORT_SYMBOL_GPL(mm_iommu_put);
 
-struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
-               unsigned long size)
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
+               unsigned long ua, unsigned long size)
 {
        struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
 
-       list_for_each_entry_rcu(mem,
-                       &current->mm->context.iommu_group_mem_list,
-                       next) {
+       list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
                if ((mem->ua <= ua) &&
                                (ua + size <= mem->ua +
                                 (mem->entries << PAGE_SHIFT))) {
@@ -324,14 +314,12 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
 }
 EXPORT_SYMBOL_GPL(mm_iommu_lookup);
 
-struct mm_iommu_table_group_mem_t *mm_iommu_find(unsigned long ua,
-               unsigned long entries)
+struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
+               unsigned long ua, unsigned long entries)
 {
        struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
 
-       list_for_each_entry_rcu(mem,
-                       &current->mm->context.iommu_group_mem_list,
-                       next) {
+       list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
                if ((mem->ua == ua) && (mem->entries == entries)) {
                        ret = mem;
                        break;
@@ -373,17 +361,7 @@ void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
 }
 EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
 
-void mm_iommu_init(mm_context_t *ctx)
+void mm_iommu_init(struct mm_struct *mm)
 {
-       INIT_LIST_HEAD_RCU(&ctx->iommu_group_mem_list);
-}
-
-void mm_iommu_cleanup(mm_context_t *ctx)
-{
-       struct mm_iommu_table_group_mem_t *mem, *tmp;
-
-       list_for_each_entry_safe(mem, tmp, &ctx->iommu_group_mem_list, next) {
-               list_del_rcu(&mem->next);
-               mm_iommu_do_free(mem);
-       }
+       INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
 }
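
With the mm_struct passed explicitly, a caller (in mainline the VFIO SPAPR TCE driver) no longer has to run in the context of the owning process when it preregisters or releases memory. A usage sketch against the new prototypes; the wrapper function and its error handling are illustrative only:

/* Sketch only: illustrative caller of the mm-aware preregistration API. */
static long preregister_region(struct mm_struct *mm, unsigned long ua,
                               unsigned long entries)
{
        struct mm_iommu_table_group_mem_t *mem;
        long ret;

        ret = mm_iommu_get(mm, ua, entries, &mem);      /* pins and accounts pages */
        if (ret)
                return ret;

        /* ... map/use the region described by mem ... */

        return mm_iommu_put(mm, mem);                   /* drops the reference */
}
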
index f4f437cbabf1d44447094e40da48fec9e14eaadc..ebf9782bacf97dffb1a88a71416b7235c840638e 100644 (file)
@@ -35,7 +35,8 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 #endif
        changed = !pmd_same(*(pmdp), entry);
        if (changed) {
-               __ptep_set_access_flags(vma->vm_mm, pmdp_ptep(pmdp), pmd_pte(entry));
+               __ptep_set_access_flags(vma->vm_mm, pmdp_ptep(pmdp),
+                                       pmd_pte(entry), address);
                flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        }
        return changed;
index e93e38455d59d86f1a8895be758d2a1daa305d02..623a0dc9a9fa560353400dbe429b7701526506df 100644 (file)
@@ -177,23 +177,15 @@ redo:
 
 static void __init radix_init_partition_table(void)
 {
-       unsigned long rts_field;
+       unsigned long rts_field, dw0;
 
+       mmu_partition_table_init();
        rts_field = radix__get_tree_size();
+       dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
+       mmu_partition_table_set_entry(0, dw0, 0);
 
-       BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
-       partition_tb = early_alloc_pgtable(1UL << PATB_SIZE_SHIFT);
-       partition_tb->patb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) |
-                                         RADIX_PGD_INDEX_SIZE | PATB_HR);
        pr_info("Initializing Radix MMU\n");
        pr_info("Partition table %p\n", partition_tb);
-
-       memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
-       /*
-        * update partition table control register,
-        * 64 K size.
-        */
-       mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
 }
 
 void __init radix_init_native(void)
@@ -320,6 +312,38 @@ static void update_hid_for_radix(void)
                cpu_relax();
 }
 
+static void radix_init_amor(void)
+{
+       /*
+        * In HV mode, we init AMOR (Authority Mask Override Register) so that
+        * the hypervisor and guest can set up IAMR (Instruction Authority Mask
+        * Register), enable key 0 and set it to 1.
+        *
+        * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
+        */
+       mtspr(SPRN_AMOR, (3ul << 62));
+}
+
+static void radix_init_iamr(void)
+{
+       unsigned long iamr;
+
+       /*
+        * The IAMR should be set to 0 on DD1.
+        */
+       if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+               iamr = 0;
+       else
+               iamr = (1ul << 62);
+
+       /*
+        * Radix always uses key0 of the IAMR to determine if an access is
+        * allowed. We set bit 0 (IBM bit 1) of key0 to prevent instruction
+        * fetch.
+        */
+       mtspr(SPRN_IAMR, iamr);
+}
+
 void __init radix__early_init_mmu(void)
 {
        unsigned long lpcr;
@@ -376,8 +400,12 @@ void __init radix__early_init_mmu(void)
                lpcr = mfspr(SPRN_LPCR);
                mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
                radix_init_partition_table();
+               radix_init_amor();
        }
 
+       memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
+
+       radix_init_iamr();
        radix_init_pgtable();
 }
 
@@ -393,7 +421,9 @@ void radix__early_init_mmu_secondary(void)
 
                mtspr(SPRN_PTCR,
                      __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+               radix_init_amor();
        }
+       radix_init_iamr();
 }
 
 void radix__mmu_cleanup_all(void)
index 911fdfb63ec1783a89a6400c54fc97c21d6b1c11..cb39c8bd243656727f60b8975c9344766d8d79a9 100644 (file)
@@ -224,7 +224,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
        if (changed) {
                if (!is_vm_hugetlb_page(vma))
                        assert_pte_locked(vma->vm_mm, address);
-               __ptep_set_access_flags(vma->vm_mm, ptep, entry);
+               __ptep_set_access_flags(vma->vm_mm, ptep, entry, address);
                flush_tlb_page(vma, address);
        }
        return changed;
index f5e8d4edb808f8748d8eaf630f027f0d5796b344..8bca7f58afc4678a167fb05ff00b6874c062c1fd 100644 (file)
@@ -431,3 +431,37 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
        }
 }
 #endif
+
+#ifdef CONFIG_PPC_BOOK3S_64
+void __init mmu_partition_table_init(void)
+{
+       unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
+
+       BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large.");
+       partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
+                                               MEMBLOCK_ALLOC_ANYWHERE));
+
+       /* Initialize the Partition Table with no entries */
+       memset((void *)partition_tb, 0, patb_size);
+
+       /*
+        * update partition table control register,
+        * 64 K size.
+        */
+       mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+}
+
+void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0,
+                                  unsigned long dw1)
+{
+       partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+       partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+
+       /* Global flush of TLBs and partition table caches for this lpid */
+       asm volatile("ptesync" : : : "memory");
+       asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : :
+                    "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));
+       asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+}
+EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry);
+#endif /* CONFIG_PPC_BOOK3S_64 */
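
mmu_partition_table_init() keeps the existing PTCR encoding: the register holds the real address of the partition table OR'd with its size expressed as log2(bytes) - 12. Assuming PATB_SIZE_SHIFT is 16 (the "64 K size" comment above implies a 64 KB table), the size field works out to 16 - 12 = 4. mmu_partition_table_set_entry() then follows every update with the ptesync / tlbie / eieio; tlbsync; ptesync sequence so that cached copies of the entry for that LPID are invalidated on all CPUs before the new translation is used.
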
index bda8c43be78a4df85d067bc4a7759c63c388d29b..2822a8277f0bdda1beca60b605912af90857881a 100644 (file)
@@ -424,3 +424,21 @@ void radix__flush_tlb_all(void)
                     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory");
        asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
+
+void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm,
+                                unsigned long address)
+{
+       /*
+        * We track the page size in the PTE only on DD1, so this
+        * should only be called on DD1.
+        */
+       if (!cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+               VM_WARN_ON(1);
+               return;
+       }
+
+       if (old_pte & _PAGE_LARGE)
+               radix__flush_tlb_page_psize(mm, address, MMU_PAGE_2M);
+       else
+               radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize);
+}
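
A hedged sketch of the intended call site: after modifying a PTE on a DD1 system the caller flushes with the page size recorded in the old PTE, while non-DD1 parts keep using the regular page-size-aware flush. Only the guarded call reflects this patch; the surrounding lines are illustrative and assume a base-page mapping:

        /* Sketch only: assumed call site after a PTE update. */
        if (cpu_has_feature(CPU_FTR_POWER9_DD1))
                radix__flush_tlb_pte_p9_dd1(old_pte, mm, address);
        else
                radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize);
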
index 6143c99f3ec503d01423059d364577d1a5930e3a..50e598cf644b5968312c465407c68a52a3524400 100644 (file)
  */
 #include "isa207-common.h"
 
+PMU_FORMAT_ATTR(event,         "config:0-49");
+PMU_FORMAT_ATTR(pmcxsel,       "config:0-7");
+PMU_FORMAT_ATTR(mark,          "config:8");
+PMU_FORMAT_ATTR(combine,       "config:11");
+PMU_FORMAT_ATTR(unit,          "config:12-15");
+PMU_FORMAT_ATTR(pmc,           "config:16-19");
+PMU_FORMAT_ATTR(cache_sel,     "config:20-23");
+PMU_FORMAT_ATTR(sample_mode,   "config:24-28");
+PMU_FORMAT_ATTR(thresh_sel,    "config:29-31");
+PMU_FORMAT_ATTR(thresh_stop,   "config:32-35");
+PMU_FORMAT_ATTR(thresh_start,  "config:36-39");
+PMU_FORMAT_ATTR(thresh_cmp,    "config:40-49");
+
+struct attribute *isa207_pmu_format_attr[] = {
+       &format_attr_event.attr,
+       &format_attr_pmcxsel.attr,
+       &format_attr_mark.attr,
+       &format_attr_combine.attr,
+       &format_attr_unit.attr,
+       &format_attr_pmc.attr,
+       &format_attr_cache_sel.attr,
+       &format_attr_sample_mode.attr,
+       &format_attr_thresh_sel.attr,
+       &format_attr_thresh_stop.attr,
+       &format_attr_thresh_start.attr,
+       &format_attr_thresh_cmp.attr,
+       NULL,
+};
+
+struct attribute_group isa207_pmu_format_group = {
+       .name = "format",
+       .attrs = isa207_pmu_format_attr,
+};
+
 static inline bool event_is_fab_match(u64 event)
 {
        /* Only check pmc, unit and pmcxsel, ignore the edge bit (0) */
@@ -21,6 +55,48 @@ static inline bool event_is_fab_match(u64 event)
        return (event == 0x30056 || event == 0x4f052);
 }
 
+static bool is_event_valid(u64 event)
+{
+       u64 valid_mask = EVENT_VALID_MASK;
+
+       if (cpu_has_feature(CPU_FTR_ARCH_300) && !cpu_has_feature(CPU_FTR_POWER9_DD1))
+               valid_mask = p9_EVENT_VALID_MASK;
+
+       return !(event & ~valid_mask);
+}
+
+static u64 mmcra_sdar_mode(u64 event)
+{
+       if (cpu_has_feature(CPU_FTR_ARCH_300) && !cpu_has_feature(CPU_FTR_POWER9_DD1))
+               return p9_SDAR_MODE(event) << MMCRA_SDAR_MODE_SHIFT;
+
+       return MMCRA_SDAR_MODE_TLB;
+}
+
+static u64 thresh_cmp_val(u64 value)
+{
+       if (cpu_has_feature(CPU_FTR_ARCH_300) && !cpu_has_feature(CPU_FTR_POWER9_DD1))
+               return value << p9_MMCRA_THR_CMP_SHIFT;
+
+       return value << MMCRA_THR_CMP_SHIFT;
+}
+
+static unsigned long combine_from_event(u64 event)
+{
+       if (cpu_has_feature(CPU_FTR_ARCH_300) && !cpu_has_feature(CPU_FTR_POWER9_DD1))
+               return p9_EVENT_COMBINE(event);
+
+       return EVENT_COMBINE(event);
+}
+
+static unsigned long combine_shift(unsigned long pmc)
+{
+       if (cpu_has_feature(CPU_FTR_ARCH_300) && !cpu_has_feature(CPU_FTR_POWER9_DD1))
+               return p9_MMCR1_COMBINE_SHIFT(pmc);
+
+       return MMCR1_COMBINE_SHIFT(pmc);
+}
+
 int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
 {
        unsigned int unit, pmc, cache, ebb;
@@ -28,7 +104,7 @@ int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
 
        mask = value = 0;
 
-       if (event & ~EVENT_VALID_MASK)
+       if (!is_event_valid(event))
                return -1;
 
        pmc   = (event >> EVENT_PMC_SHIFT)        & EVENT_PMC_MASK;
@@ -155,15 +231,13 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
                        pmc_inuse |= 1 << pmc;
        }
 
-       /* In continuous sampling mode, update SDAR on TLB miss */
-       mmcra = MMCRA_SDAR_MODE_TLB;
-       mmcr1 = mmcr2 = 0;
+       mmcra = mmcr1 = mmcr2 = 0;
 
        /* Second pass: assign PMCs, set all MMCR1 fields */
        for (i = 0; i < n_ev; ++i) {
                pmc     = (event[i] >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK;
                unit    = (event[i] >> EVENT_UNIT_SHIFT) & EVENT_UNIT_MASK;
-               combine = (event[i] >> EVENT_COMBINE_SHIFT) & EVENT_COMBINE_MASK;
+               combine = combine_from_event(event[i]);
                psel    =  event[i] & EVENT_PSEL_MASK;
 
                if (!pmc) {
@@ -177,10 +251,13 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
 
                if (pmc <= 4) {
                        mmcr1 |= unit << MMCR1_UNIT_SHIFT(pmc);
-                       mmcr1 |= combine << MMCR1_COMBINE_SHIFT(pmc);
+                       mmcr1 |= combine << combine_shift(pmc);
                        mmcr1 |= psel << MMCR1_PMCSEL_SHIFT(pmc);
                }
 
+               /* In continuous sampling mode, update SDAR on TLB miss */
+               mmcra |= mmcra_sdar_mode(event[i]);
+
                if (event[i] & EVENT_IS_L1) {
                        cache = event[i] >> EVENT_CACHE_SEL_SHIFT;
                        mmcr1 |= (cache & 1) << MMCR1_IC_QUAL_SHIFT;
@@ -211,7 +288,7 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
                        val = (event[i] >> EVENT_THR_SEL_SHIFT) & EVENT_THR_SEL_MASK;
                        mmcra |= val << MMCRA_THR_SEL_SHIFT;
                        val = (event[i] >> EVENT_THR_CMP_SHIFT) & EVENT_THR_CMP_MASK;
-                       mmcra |= val << MMCRA_THR_CMP_SHIFT;
+                       mmcra |= thresh_cmp_val(val);
                }
 
                if (event[i] & EVENT_WANTS_BHRB) {
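
mmcra_sdar_mode() is where the SDAR mode stops being hard-coded: on ISA v3.0 (non-DD1) parts the two "sm" bits from the raw event (config bits 50-51, defined in the header below) are placed at MMCRA bit 42, while older parts keep the fixed update-SDAR-on-TLB-miss setting. Worked example: an event with sm = 0b01 gives p9_SDAR_MODE(event) = 1, so the code ORs in 1ull << 42, which is exactly the legacy MMCRA_SDAR_MODE_TLB value.
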
index 4d0a4e5017c20377fc0ebd9e5bd286d6a26ee74b..90495f1580c7d9f2001a0e9532a723fbea6b938c 100644 (file)
 #define EVENT_UNIT_MASK                0xf
 #define EVENT_COMBINE_SHIFT    11      /* Combine bit */
 #define EVENT_COMBINE_MASK     0x1
+#define EVENT_COMBINE(v)       (((v) >> EVENT_COMBINE_SHIFT) & EVENT_COMBINE_MASK)
 #define EVENT_MARKED_SHIFT     8       /* Marked bit */
 #define EVENT_MARKED_MASK      0x1
 #define EVENT_IS_MARKED                (EVENT_MARKED_MASK << EVENT_MARKED_SHIFT)
         PERF_SAMPLE_BRANCH_KERNEL      |\
         PERF_SAMPLE_BRANCH_HV)
 
+/* Constants to support power9 raw encoding format */
+#define p9_EVENT_COMBINE_SHIFT 10      /* Combine bit */
+#define p9_EVENT_COMBINE_MASK  0x3ull
+#define p9_EVENT_COMBINE(v)    (((v) >> p9_EVENT_COMBINE_SHIFT) & p9_EVENT_COMBINE_MASK)
+#define p9_SDAR_MODE_SHIFT     50
+#define p9_SDAR_MODE_MASK      0x3ull
+#define p9_SDAR_MODE(v)                (((v) >> p9_SDAR_MODE_SHIFT) & p9_SDAR_MODE_MASK)
+
+#define p9_EVENT_VALID_MASK            \
+       ((p9_SDAR_MODE_MASK   << p9_SDAR_MODE_SHIFT             |       \
+       (EVENT_THRESH_MASK    << EVENT_THRESH_SHIFT)            |       \
+       (EVENT_SAMPLE_MASK    << EVENT_SAMPLE_SHIFT)            |       \
+       (EVENT_CACHE_SEL_MASK << EVENT_CACHE_SEL_SHIFT)         |       \
+       (EVENT_PMC_MASK       << EVENT_PMC_SHIFT)               |       \
+       (EVENT_UNIT_MASK      << EVENT_UNIT_SHIFT)              |       \
+       (p9_EVENT_COMBINE_MASK << p9_EVENT_COMBINE_SHIFT)       |       \
+       (EVENT_MARKED_MASK    << EVENT_MARKED_SHIFT)            |       \
+        EVENT_LINUX_MASK                                       |       \
+        EVENT_PSEL_MASK))
+
 /*
  * Layout of constraint bits:
  *
 #define MMCR1_DC_QUAL_SHIFT            47
 #define MMCR1_IC_QUAL_SHIFT            46
 
+/* MMCR1 Combine bits macro for power9 */
+#define p9_MMCR1_COMBINE_SHIFT(pmc)    (38 - ((pmc - 1) * 2))
+
 /* Bits in MMCRA for PowerISA v2.07 */
 #define MMCRA_SAMP_MODE_SHIFT          1
 #define MMCRA_SAMP_ELIG_SHIFT          4
 #define MMCRA_THR_CTL_SHIFT            8
 #define MMCRA_THR_SEL_SHIFT            16
 #define MMCRA_THR_CMP_SHIFT            32
-#define MMCRA_SDAR_MODE_TLB            (1ull << 42)
+#define MMCRA_SDAR_MODE_SHIFT          42
+#define MMCRA_SDAR_MODE_TLB            (1ull << MMCRA_SDAR_MODE_SHIFT)
 #define MMCRA_IFM_SHIFT                        30
 
+/* MMCRA Threshold Compare bit constant for power9 */
+#define p9_MMCRA_THR_CMP_SHIFT 45
+
 /* Bits in MMCR2 for PowerISA v2.07 */
 #define MMCR2_FCS(pmc)                 (1ull << (63 - (((pmc) - 1) * 9)))
 #define MMCR2_FCP(pmc)                 (1ull << (62 - (((pmc) - 1) * 9)))
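
The new p9_MMCR1_COMBINE_SHIFT(pmc) macro, 38 - (pmc - 1) * 2, evaluates to 38, 36, 34 and 32 for PMC1-PMC4. Each shift places a 2-bit combine field at MMCR1 IBM bits 24:25, 26:27, 28:29 and 30:31 respectively, matching the pmcNcombine layout documented in the power9-pmu.c comment added further down.
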
index ab830d106ec57ff48d703c555c9c1f49eba2b004..d07186382f3a75b147bfe582563f98341f50ea7e 100644 (file)
@@ -30,6 +30,9 @@ enum {
 #define        POWER8_MMCRA_IFM2               0x0000000080000000UL
 #define        POWER8_MMCRA_IFM3               0x00000000C0000000UL
 
+/* PowerISA v2.07 format attribute structure */
+extern struct attribute_group isa207_pmu_format_group;
+
 /* Table of alternatives, sorted by column 0 */
 static const unsigned int event_alternatives[][MAX_ALT] = {
        { PM_MRK_ST_CMPL,               PM_MRK_ST_CMPL_ALT },
@@ -175,42 +178,8 @@ static struct attribute_group power8_pmu_events_group = {
        .attrs = power8_events_attr,
 };
 
-PMU_FORMAT_ATTR(event,         "config:0-49");
-PMU_FORMAT_ATTR(pmcxsel,       "config:0-7");
-PMU_FORMAT_ATTR(mark,          "config:8");
-PMU_FORMAT_ATTR(combine,       "config:11");
-PMU_FORMAT_ATTR(unit,          "config:12-15");
-PMU_FORMAT_ATTR(pmc,           "config:16-19");
-PMU_FORMAT_ATTR(cache_sel,     "config:20-23");
-PMU_FORMAT_ATTR(sample_mode,   "config:24-28");
-PMU_FORMAT_ATTR(thresh_sel,    "config:29-31");
-PMU_FORMAT_ATTR(thresh_stop,   "config:32-35");
-PMU_FORMAT_ATTR(thresh_start,  "config:36-39");
-PMU_FORMAT_ATTR(thresh_cmp,    "config:40-49");
-
-static struct attribute *power8_pmu_format_attr[] = {
-       &format_attr_event.attr,
-       &format_attr_pmcxsel.attr,
-       &format_attr_mark.attr,
-       &format_attr_combine.attr,
-       &format_attr_unit.attr,
-       &format_attr_pmc.attr,
-       &format_attr_cache_sel.attr,
-       &format_attr_sample_mode.attr,
-       &format_attr_thresh_sel.attr,
-       &format_attr_thresh_stop.attr,
-       &format_attr_thresh_start.attr,
-       &format_attr_thresh_cmp.attr,
-       NULL,
-};
-
-static struct attribute_group power8_pmu_format_group = {
-       .name = "format",
-       .attrs = power8_pmu_format_attr,
-};
-
 static const struct attribute_group *power8_pmu_attr_groups[] = {
-       &power8_pmu_format_group,
+       &isa207_pmu_format_group,
        &power8_pmu_events_group,
        NULL,
 };
index 8e9a81967ff8516b8c04d47e65df92c7daeeba1c..346010e8d463d36d2c411839ecb550d9534f61b1 100644 (file)
 
 #include "isa207-common.h"
 
+/*
+ * Raw event encoding for Power9:
+ *
+ *        60        56        52        48        44        40        36        32
+ * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
+ *   | | [ ]                       [ ] [      thresh_cmp     ]   [  thresh_ctl   ]
+ *   | |  |                         |                                     |
+ *   | |  *- IFM (Linux)            |    thresh start/stop OR FAB match -*
+ *   | *- BHRB (Linux)              *sm
+ *   *- EBB (Linux)
+ *
+ *        28        24        20        16        12         8         4         0
+ * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - |
+ *   [   ] [  sample ]   [cache]   [ pmc ]   [unit ]   []    m   [    pmcxsel    ]
+ *     |        |           |                          |     |
+ *     |        |           |                          |     *- mark
+ *     |        |           *- L1/L2/L3 cache_sel      |
+ *     |        |                                      |
+ *     |        *- sampling mode for marked events     *- combine
+ *     |
+ *     *- thresh_sel
+ *
+ * Below uses IBM bit numbering.
+ *
+ * MMCR1[x:y] = unit    (PMCxUNIT)
+ * MMCR1[24]   = pmc1combine[0]
+ * MMCR1[25]   = pmc1combine[1]
+ * MMCR1[26]   = pmc2combine[0]
+ * MMCR1[27]   = pmc2combine[1]
+ * MMCR1[28]   = pmc3combine[0]
+ * MMCR1[29]   = pmc3combine[1]
+ * MMCR1[30]   = pmc4combine[0]
+ * MMCR1[31]   = pmc4combine[1]
+ *
+ * if pmc == 3 and unit == 0 and pmcxsel[0:6] == 0b0101011
+ *     # PM_MRK_FAB_RSP_MATCH
+ *     MMCR1[20:27] = thresh_ctl   (FAB_CRESP_MATCH / FAB_TYPE_MATCH)
+ * else if pmc == 4 and unit == 0xf and pmcxsel[0:6] == 0b0101001
+ *     # PM_MRK_FAB_RSP_MATCH_CYC
+ *     MMCR1[20:27] = thresh_ctl   (FAB_CRESP_MATCH / FAB_TYPE_MATCH)
+ * else
+ *     MMCRA[48:55] = thresh_ctl   (THRESH START/END)
+ *
+ * if thresh_sel:
+ *     MMCRA[45:47] = thresh_sel
+ *
+ * if thresh_cmp:
+ *     MMCRA[9:11] = thresh_cmp[0:2]
+ *     MMCRA[12:18] = thresh_cmp[3:9]
+ *
+ * if unit == 6 or unit == 7
+ *     MMCRC[53:55] = cache_sel[1:3]      (L2EVENT_SEL)
+ * else if unit == 8 or unit == 9:
+ *     if cache_sel[0] == 0: # L3 bank
+ *             MMCRC[47:49] = cache_sel[1:3]  (L3EVENT_SEL0)
+ *     else if cache_sel[0] == 1:
+ *             MMCRC[50:51] = cache_sel[2:3]  (L3EVENT_SEL1)
+ * else if cache_sel[1]: # L1 event
+ *     MMCR1[16] = cache_sel[2]
+ *     MMCR1[17] = cache_sel[3]
+ *
+ * if mark:
+ *     MMCRA[63]    = 1                (SAMPLE_ENABLE)
+ *     MMCRA[57:59] = sample[0:2]      (RAND_SAMP_ELIG)
+ *     MMCRA[61:62] = sample[3:4]      (RAND_SAMP_MODE)
+ *
+ * if EBB and BHRB:
+ *     MMCRA[32:33] = IFM
+ *
+ * MMCRA[SDAR_MODE]  = sm
+ */
+
 /*
  * Some power9 event codes.
  */
@@ -31,6 +103,9 @@ enum {
 #define POWER9_MMCRA_IFM2              0x0000000080000000UL
 #define POWER9_MMCRA_IFM3              0x00000000C0000000UL
 
+/* PowerISA v2.07 format attribute structure */
+extern struct attribute_group isa207_pmu_format_group;
+
 GENERIC_EVENT_ATTR(cpu-cycles,                 PM_CYC);
 GENERIC_EVENT_ATTR(stalled-cycles-frontend,    PM_ICT_NOSLOT_CYC);
 GENERIC_EVENT_ATTR(stalled-cycles-backend,     PM_CMPLU_STALL);
@@ -90,10 +165,16 @@ static struct attribute_group power9_pmu_events_group = {
        .attrs = power9_events_attr,
 };
 
-PMU_FORMAT_ATTR(event,         "config:0-49");
+static const struct attribute_group *power9_isa207_pmu_attr_groups[] = {
+       &isa207_pmu_format_group,
+       &power9_pmu_events_group,
+       NULL,
+};
+
+PMU_FORMAT_ATTR(event,         "config:0-51");
 PMU_FORMAT_ATTR(pmcxsel,       "config:0-7");
 PMU_FORMAT_ATTR(mark,          "config:8");
-PMU_FORMAT_ATTR(combine,       "config:11");
+PMU_FORMAT_ATTR(combine,       "config:10-11");
 PMU_FORMAT_ATTR(unit,          "config:12-15");
 PMU_FORMAT_ATTR(pmc,           "config:16-19");
 PMU_FORMAT_ATTR(cache_sel,     "config:20-23");
@@ -102,6 +183,7 @@ PMU_FORMAT_ATTR(thresh_sel, "config:29-31");
 PMU_FORMAT_ATTR(thresh_stop,   "config:32-35");
 PMU_FORMAT_ATTR(thresh_start,  "config:36-39");
 PMU_FORMAT_ATTR(thresh_cmp,    "config:40-49");
+PMU_FORMAT_ATTR(sdar_mode,     "config:50-51");
 
 static struct attribute *power9_pmu_format_attr[] = {
        &format_attr_event.attr,
@@ -116,6 +198,7 @@ static struct attribute *power9_pmu_format_attr[] = {
        &format_attr_thresh_stop.attr,
        &format_attr_thresh_start.attr,
        &format_attr_thresh_cmp.attr,
+       &format_attr_sdar_mode.attr,
        NULL,
 };
 
@@ -291,6 +374,24 @@ static int power9_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 
 #undef C
 
+static struct power_pmu power9_isa207_pmu = {
+       .name                   = "POWER9",
+       .n_counter              = MAX_PMU_COUNTERS,
+       .add_fields             = ISA207_ADD_FIELDS,
+       .test_adder             = ISA207_TEST_ADDER,
+       .compute_mmcr           = isa207_compute_mmcr,
+       .config_bhrb            = power9_config_bhrb,
+       .bhrb_filter_map        = power9_bhrb_filter_map,
+       .get_constraint         = isa207_get_constraint,
+       .disable_pmc            = isa207_disable_pmc,
+       .flags                  = PPMU_HAS_SIER | PPMU_ARCH_207S,
+       .n_generic              = ARRAY_SIZE(power9_generic_events),
+       .generic_events         = power9_generic_events,
+       .cache_events           = &power9_cache_events,
+       .attr_groups            = power9_isa207_pmu_attr_groups,
+       .bhrb_nr                = 32,
+};
+
 static struct power_pmu power9_pmu = {
        .name                   = "POWER9",
        .n_counter              = MAX_PMU_COUNTERS,
@@ -311,14 +412,19 @@ static struct power_pmu power9_pmu = {
 
 static int __init init_power9_pmu(void)
 {
-       int rc;
+       int rc = 0;
 
        /* Comes from cpu_specs[] */
        if (!cur_cpu_spec->oprofile_cpu_type ||
            strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power9"))
                return -ENODEV;
 
-       rc = register_power_pmu(&power9_pmu);
+       if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+               rc = register_power_pmu(&power9_isa207_pmu);
+       } else {
+               rc = register_power_pmu(&power9_pmu);
+       }
+
        if (rc)
                return rc;
 
index e3257f24a8a19b28248fd3248590536ef25d812d..f8d1410aa5bb364f5d2966ebe2f21ddf67fa8159 100644 (file)
@@ -102,18 +102,18 @@ config 405GP
        bool
        select IBM405_ERR77
        select IBM405_ERR51
-       select IBM_EMAC_ZMII
+       select IBM_EMAC_ZMII if IBM_EMAC
 
 config 405EX
        bool
-       select IBM_EMAC_EMAC4
-       select IBM_EMAC_RGMII
+       select IBM_EMAC_EMAC4 if IBM_EMAC
+       select IBM_EMAC_RGMII if IBM_EMAC
 
 config 405EZ
        bool
-       select IBM_EMAC_NO_FLOW_CTRL
-       select IBM_EMAC_MAL_CLR_ICINTSTAT
-       select IBM_EMAC_MAL_COMMON_ERR
+       select IBM_EMAC_NO_FLOW_CTRL if IBM_EMAC
+       select IBM_EMAC_MAL_CLR_ICINTSTAT if IBM_EMAC
+       select IBM_EMAC_MAL_COMMON_ERR if IBM_EMAC
 
 config XILINX_VIRTEX
        bool
index 48fc18041ff681188a9b9fe4495bcf795b6b59f6..8d18669856f91be3e6182dd2908ecc9a07c2f007 100644 (file)
@@ -26,7 +26,7 @@ config BLUESTONE
        select PCI_MSI
        select PPC4xx_MSI
        select PPC4xx_PCI_EXPRESS
-       select IBM_EMAC_RGMII
+       select IBM_EMAC_RGMII if IBM_EMAC
        help
          This option enables support for the APM APM821xx Evaluation board.
 
@@ -125,8 +125,8 @@ config CANYONLANDS
        select PPC4xx_PCI_EXPRESS
        select PCI_MSI
        select PPC4xx_MSI
-       select IBM_EMAC_RGMII
-       select IBM_EMAC_ZMII
+       select IBM_EMAC_RGMII if IBM_EMAC
+       select IBM_EMAC_ZMII if IBM_EMAC
        help
          This option enables support for the AMCC PPC460EX evaluation board.
 
@@ -138,8 +138,8 @@ config GLACIER
        select 460EX # Odd since it uses 460GT but the effects are the same
        select PCI
        select PPC4xx_PCI_EXPRESS
-       select IBM_EMAC_RGMII
-       select IBM_EMAC_ZMII
+       select IBM_EMAC_RGMII if IBM_EMAC
+       select IBM_EMAC_ZMII if IBM_EMAC
        help
          This option enables support for the AMCC PPC460GT evaluation board.
 
@@ -164,7 +164,7 @@ config EIGER
        select 460SX
        select PCI
        select PPC4xx_PCI_EXPRESS
-       select IBM_EMAC_RGMII
+       select IBM_EMAC_RGMII if IBM_EMAC
        help
          This option enables support for the AMCC PPC460SX evaluation board.
 
@@ -213,7 +213,7 @@ config AKEBONO
        select NETDEVICES
        select ETHERNET
        select NET_VENDOR_IBM
-       select IBM_EMAC_EMAC4
+       select IBM_EMAC_EMAC4 if IBM_EMAC
        select USB if USB_SUPPORT
        select USB_OHCI_HCD_PLATFORM if USB_OHCI_HCD
        select USB_EHCI_HCD_PLATFORM if USB_EHCI_HCD
@@ -290,54 +290,54 @@ config 440EP
        bool
        select PPC_FPU
        select IBM440EP_ERR42
-       select IBM_EMAC_ZMII
+       select IBM_EMAC_ZMII if IBM_EMAC
 
 config 440EPX
        bool
        select PPC_FPU
-       select IBM_EMAC_EMAC4
-       select IBM_EMAC_RGMII
-       select IBM_EMAC_ZMII
+       select IBM_EMAC_EMAC4 if IBM_EMAC
+       select IBM_EMAC_RGMII if IBM_EMAC
+       select IBM_EMAC_ZMII if IBM_EMAC
        select USB_EHCI_BIG_ENDIAN_MMIO
        select USB_EHCI_BIG_ENDIAN_DESC
 
 config 440GRX
        bool
-       select IBM_EMAC_EMAC4
-       select IBM_EMAC_RGMII
-       select IBM_EMAC_ZMII
+       select IBM_EMAC_EMAC4 if IBM_EMAC
+       select IBM_EMAC_RGMII if IBM_EMAC
+       select IBM_EMAC_ZMII if IBM_EMAC
 
 config 440GP
        bool
-       select IBM_EMAC_ZMII
+       select IBM_EMAC_ZMII if IBM_EMAC
 
 config 440GX
        bool
-       select IBM_EMAC_EMAC4
-       select IBM_EMAC_RGMII
-       select IBM_EMAC_ZMII #test only
-       select IBM_EMAC_TAH  #test only
+       select IBM_EMAC_EMAC4 if IBM_EMAC
+       select IBM_EMAC_RGMII if IBM_EMAC
+       select IBM_EMAC_ZMII if IBM_EMAC #test only
+       select IBM_EMAC_TAH if IBM_EMAC  #test only
 
 config 440SP
        bool
 
 config 440SPe
        bool
-       select IBM_EMAC_EMAC4
+       select IBM_EMAC_EMAC4 if IBM_EMAC
 
 config 460EX
        bool
        select PPC_FPU
-       select IBM_EMAC_EMAC4
-       select IBM_EMAC_TAH
+       select IBM_EMAC_EMAC4 if IBM_EMAC
+       select IBM_EMAC_TAH if IBM_EMAC
 
 config 460SX
        bool
        select PPC_FPU
-       select IBM_EMAC_EMAC4
-       select IBM_EMAC_RGMII
-       select IBM_EMAC_ZMII
-       select IBM_EMAC_TAH
+       select IBM_EMAC_EMAC4 if IBM_EMAC
+       select IBM_EMAC_RGMII if IBM_EMAC
+       select IBM_EMAC_ZMII if IBM_EMAC
+       select IBM_EMAC_TAH if IBM_EMAC
 
 config 476FPE
        bool
@@ -346,8 +346,8 @@ config 476FPE
 config APM821xx
        bool
        select PPC_FPU
-       select IBM_EMAC_EMAC4
-       select IBM_EMAC_TAH
+       select IBM_EMAC_EMAC4 if IBM_EMAC
+       select IBM_EMAC_TAH if IBM_EMAC
 
 config 476FPE_ERR46
        depends on 476FPE
index 1179115a4b5c64aff563af4699e645b311b0e979..3803b0addf657edafae38ee217136db69a5db900 100644 (file)
@@ -220,7 +220,7 @@ define_machine(corenet_generic) {
  *
  * Likewise, problems have been seen with kexec when coreint is enabled.
  */
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_KEXEC)
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_KEXEC_CORE)
        .get_irq                = mpic_get_irq,
 #else
        .get_irq                = mpic_get_coreint_irq,
index fe9f19e5e935f538dec748c40390e41d2313ff09..a83a6d26090d1dbc8356f9d15d9e6401931ab3e3 100644 (file)
@@ -349,13 +349,13 @@ struct smp_ops_t smp_85xx_ops = {
        .cpu_disable    = generic_cpu_disable,
        .cpu_die        = generic_cpu_die,
 #endif
-#if defined(CONFIG_KEXEC) && !defined(CONFIG_PPC64)
+#if defined(CONFIG_KEXEC_CORE) && !defined(CONFIG_PPC64)
        .give_timebase  = smp_generic_give_timebase,
        .take_timebase  = smp_generic_take_timebase,
 #endif
 };
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 #ifdef CONFIG_PPC32
 atomic_t kexec_down_cpus = ATOMIC_INIT(0);
 
@@ -458,7 +458,7 @@ static void mpc85xx_smp_machine_kexec(struct kimage *image)
 
        default_machine_kexec(image);
 }
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 static void smp_85xx_basic_setup(int cpu_nr)
 {
@@ -512,7 +512,7 @@ void __init mpc85xx_smp_init(void)
 #endif
        smp_ops = &smp_85xx_ops;
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        ppc_md.kexec_cpu_down = mpc85xx_smp_kexec_cpu_down;
        ppc_md.machine_kexec = mpc85xx_smp_machine_kexec;
 #endif
index fbdae8377b71452cf2afb8b2e24e36a5d9ccd9b0..7e3a2ebba29b72f2bea3bc5413ae750be7138313 100644 (file)
@@ -168,17 +168,6 @@ config MPIC_BROKEN_REGREAD
          well, but enabling it uses about 8KB of memory to keep copies
          of the register contents in software.
 
-config IBMVIO
-       depends on PPC_PSERIES
-       bool
-       default y
-
-config IBMEBUS
-       depends on PPC_PSERIES
-       bool "Support for GX bus based adapters"
-       help
-         Bus device driver for GX bus based adapters.
-
 config EEH
        bool
        depends on (PPC_POWERNV || PPC_PSERIES) && PCI
index d9088f0b8fcc56f76f0e1f29787a83864efbcbc9..a4522f09d65ecee59b225204e16e40b7589c8893 100644 (file)
@@ -17,10 +17,10 @@ config PPC_CELL_NATIVE
        select PPC_CELL_COMMON
        select MPIC
        select PPC_IO_WORKAROUNDS
-       select IBM_EMAC_EMAC4
-       select IBM_EMAC_RGMII
-       select IBM_EMAC_ZMII #test only
-       select IBM_EMAC_TAH  #test only
+       select IBM_EMAC_EMAC4 if IBM_EMAC
+       select IBM_EMAC_RGMII if IBM_EMAC
+       select IBM_EMAC_ZMII if IBM_EMAC #test only
+       select IBM_EMAC_TAH if IBM_EMAC  #test only
        default n
 
 config PPC_IBM_CELL_BLADE
@@ -46,7 +46,6 @@ config SPU_FS
        default m
        depends on PPC_CELL
        select SPU_BASE
-       select MEMORY_HOTPLUG
        help
          The SPU file system is used to access Synergistic Processing
          Units on machines implementing the Broadband Processor
index e84d8fbc2e21ddbee1a73e1f87cf1f06cc9a11f7..96c2b8a406303194737962905330c0a7a3900944 100644 (file)
@@ -676,7 +676,7 @@ static ssize_t spu_stat_show(struct device *dev,
 
 static DEVICE_ATTR(stat, 0444, spu_stat_show, NULL);
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 
 struct crash_spu_info {
        struct spu *spu;
index 2354ea51e871428211ef13fee8f976844ab1f32c..6fb5522acd7085c3ffdd003e84637050dcb92146 100644 (file)
@@ -393,7 +393,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
        /* Create PE */
        ret = eeh_add_to_parent_pe(edev);
        if (ret) {
-               pr_warn("%s: Can't add PCI dev %04x:%02x:%02x.%01x to parent PE (%d)\n",
+               pr_warn("%s: Can't add PCI dev %04x:%02x:%02x.%01x to parent PE (%x)\n",
                        __func__, hose->global_number, pdn->busno,
                        PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn), ret);
                return NULL;
@@ -1097,7 +1097,7 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
 
        bus = eeh_pe_bus_get(pe);
        if (!bus) {
-               pr_err("%s: Cannot find PCI bus for PHB#%d-PE#%x\n",
+               pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
                        __func__, pe->phb->global_number, pe->addr);
                return -EIO;
        }
index aec85e778028232fdd275bd318bbba7c7ff40fd4..73b155fd4481595d6763929f76ea8df344e3b6b2 100644 (file)
@@ -263,7 +263,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
        /* Enable the bypass window */
 
        top = roundup_pow_of_two(top);
-       dev_info(&npe->pdev->dev, "Enabling bypass for PE %d\n",
+       dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
                        npe->pe_number);
        rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
                        npe->pe_number, npe->pe_number,
index 44d2d842cee79009f012842dcfbda7a1896b00f4..3aa40f1b20f510647acf6aa64e7e74f7e6af137b 100644 (file)
@@ -304,8 +304,11 @@ OPAL_CALL(opal_pci_get_presence_state,             OPAL_PCI_GET_PRESENCE_STATE);
 OPAL_CALL(opal_pci_get_power_state,            OPAL_PCI_GET_POWER_STATE);
 OPAL_CALL(opal_pci_set_power_state,            OPAL_PCI_SET_POWER_STATE);
 OPAL_CALL(opal_int_get_xirr,                   OPAL_INT_GET_XIRR);
+OPAL_CALL_REAL(opal_rm_int_get_xirr,           OPAL_INT_GET_XIRR);
 OPAL_CALL(opal_int_set_cppr,                   OPAL_INT_SET_CPPR);
 OPAL_CALL(opal_int_eoi,                                OPAL_INT_EOI);
+OPAL_CALL_REAL(opal_rm_int_eoi,                        OPAL_INT_EOI);
 OPAL_CALL(opal_int_set_mfrr,                   OPAL_INT_SET_MFRR);
+OPAL_CALL_REAL(opal_rm_int_set_mfrr,           OPAL_INT_SET_MFRR);
 OPAL_CALL(opal_pci_tce_kill,                   OPAL_PCI_TCE_KILL);
 OPAL_CALL_REAL(opal_rm_pci_tce_kill,           OPAL_PCI_TCE_KILL);
index 893d8ea995aa2fe0d02e518b7dc3af05b4144684..282293572dc82a61ed9c39a607bd69cf47910d5b 100644 (file)
@@ -886,3 +886,5 @@ EXPORT_SYMBOL_GPL(opal_leds_get_ind);
 EXPORT_SYMBOL_GPL(opal_leds_set_ind);
 /* Export this symbol for PowerNV Operator Panel class driver */
 EXPORT_SYMBOL_GPL(opal_write_oppanel_async);
+/* Export this for KVM */
+EXPORT_SYMBOL_GPL(opal_int_set_mfrr);
index d4b33dd2d9e740d789da5768957beefcafd7f997..b07680cd251820c6b38a41a878d7598d2a02b05b 100644 (file)
@@ -83,7 +83,7 @@ void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
                        PCI_SLOT(pe->rid), PCI_FUNC(pe->rid));
 #endif /* CONFIG_PCI_IOV*/
 
-       printk("%spci %s: [PE# %.3d] %pV",
+       printk("%spci %s: [PE# %.2x] %pV",
               level, pfix, pe->pe_number, &vaf);
 
        va_end(args);
@@ -145,8 +145,8 @@ static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
         */
        rc = opal_pci_eeh_freeze_clear(phb->opal_id, pe_no,
                                       OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
-       if (rc != OPAL_SUCCESS)
-               pr_warn("%s: Error %lld unfreezing PHB#%d-PE#%d\n",
+       if (rc != OPAL_SUCCESS && rc != OPAL_UNSUPPORTED)
+               pr_warn("%s: Error %lld unfreezing PHB#%x-PE#%x\n",
                        __func__, rc, phb->hose->global_number, pe_no);
 
        return &phb->ioda.pe_array[pe_no];
@@ -155,13 +155,13 @@ static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
 static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
 {
        if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) {
-               pr_warn("%s: Invalid PE %d on PHB#%x\n",
+               pr_warn("%s: Invalid PE %x on PHB#%x\n",
                        __func__, pe_no, phb->hose->global_number);
                return;
        }
 
        if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
-               pr_debug("%s: PE %d was reserved on PHB#%x\n",
+               pr_debug("%s: PE %x was reserved on PHB#%x\n",
                         __func__, pe_no, phb->hose->global_number);
 
        pnv_ioda_init_pe(phb, pe_no);
@@ -229,7 +229,7 @@ static int pnv_ioda2_init_m64(struct pnv_phb *phb)
        else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
                r->end -= (2 * phb->ioda.m64_segsize);
        else
-               pr_warn("  Cannot strip M64 segment for reserved PE#%d\n",
+               pr_warn("  Cannot strip M64 segment for reserved PE#%x\n",
                        phb->ioda.reserved_pe_idx);
 
        return 0;
@@ -291,7 +291,7 @@ static int pnv_ioda1_init_m64(struct pnv_phb *phb)
                                OPAL_M64_WINDOW_TYPE, index, base, 0,
                                PNV_IODA1_M64_SEGS * segsz);
                if (rc != OPAL_SUCCESS) {
-                       pr_warn("  Error %lld setting M64 PHB#%d-BAR#%d\n",
+                       pr_warn("  Error %lld setting M64 PHB#%x-BAR#%d\n",
                                rc, phb->hose->global_number, index);
                        goto fail;
                }
@@ -300,7 +300,7 @@ static int pnv_ioda1_init_m64(struct pnv_phb *phb)
                                OPAL_M64_WINDOW_TYPE, index,
                                OPAL_ENABLE_M64_SPLIT);
                if (rc != OPAL_SUCCESS) {
-                       pr_warn("  Error %lld enabling M64 PHB#%d-BAR#%d\n",
+                       pr_warn("  Error %lld enabling M64 PHB#%x-BAR#%d\n",
                                rc, phb->hose->global_number, index);
                        goto fail;
                }
@@ -316,7 +316,7 @@ static int pnv_ioda1_init_m64(struct pnv_phb *phb)
        else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
                r->end -= (2 * phb->ioda.m64_segsize);
        else
-               WARN(1, "Wrong reserved PE#%d on PHB#%d\n",
+               WARN(1, "Wrong reserved PE#%x on PHB#%x\n",
                     phb->ioda.reserved_pe_idx, phb->hose->global_number);
 
        return 0;
@@ -414,7 +414,7 @@ static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
                                        pe->pe_number / PNV_IODA1_M64_SEGS,
                                        pe->pe_number % PNV_IODA1_M64_SEGS);
                        if (rc != OPAL_SUCCESS)
-                               pr_warn("%s: Error %lld mapping M64 for PHB#%d-PE#%d\n",
+                               pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
                                        __func__, rc, phb->hose->global_number,
                                        pe->pe_number);
                }
@@ -941,14 +941,14 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
        pe->mve_number = pe->pe_number;
        rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
        if (rc != OPAL_SUCCESS) {
-               pe_err(pe, "OPAL error %ld setting up MVE %d\n",
+               pe_err(pe, "OPAL error %ld setting up MVE %x\n",
                       rc, pe->mve_number);
                pe->mve_number = -1;
        } else {
                rc = opal_pci_set_mve_enable(phb->opal_id,
                                             pe->mve_number, OPAL_ENABLE_MVE);
                if (rc) {
-                       pe_err(pe, "OPAL error %ld enabling MVE %d\n",
+                       pe_err(pe, "OPAL error %ld enabling MVE %x\n",
                               rc, pe->mve_number);
                        pe->mve_number = -1;
                }
@@ -1159,10 +1159,10 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
        pe->rid = bus->busn_res.start << 8;
 
        if (all)
-               pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n",
+               pe_info(pe, "Secondary bus %d..%d associated with PE#%x\n",
                        bus->busn_res.start, bus->busn_res.end, pe->pe_number);
        else
-               pe_info(pe, "Secondary bus %d associated with PE#%d\n",
+               pe_info(pe, "Secondary bus %d associated with PE#%x\n",
                        bus->busn_res.start, pe->pe_number);
 
        if (pnv_ioda_configure_pe(phb, pe)) {
@@ -1213,7 +1213,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
                         * peer NPU.
                         */
                        dev_info(&npu_pdev->dev,
-                               "Associating to existing PE %d\n", pe_num);
+                               "Associating to existing PE %x\n", pe_num);
                        pci_dev_get(npu_pdev);
                        npu_pdn = pci_get_pdn(npu_pdev);
                        rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
@@ -1539,7 +1539,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
                pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
                           pci_iov_virtfn_devfn(pdev, vf_index);
 
-               pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%d\n",
+               pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
                        hose->global_number, pdev->bus->number,
                        PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
                        PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
@@ -2844,7 +2844,7 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
        pnv_set_msi_irq_chip(phb, virq);
 
        pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
-                " address=%x_%08x data=%x PE# %d\n",
+                " address=%x_%08x data=%x PE# %x\n",
                 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
                 msg->address_hi, msg->address_lo, data, pe->pe_number);
 
@@ -2993,7 +2993,7 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
                        rc = opal_pci_map_pe_mmio_window(phb->opal_id,
                                pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index);
                        if (rc != OPAL_SUCCESS) {
-                               pr_err("%s: Error %lld mapping IO segment#%d to PE#%d\n",
+                               pr_err("%s: Error %lld mapping IO segment#%d to PE#%x\n",
                                       __func__, rc, index, pe->pe_number);
                                break;
                        }
@@ -3017,7 +3017,7 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
                        rc = opal_pci_map_pe_mmio_window(phb->opal_id,
                                pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index);
                        if (rc != OPAL_SUCCESS) {
-                               pr_err("%s: Error %lld mapping M32 segment#%d to PE#%d",
+                               pr_err("%s: Error %lld mapping M32 segment#%d to PE#%x",
                                       __func__, rc, index, pe->pe_number);
                                break;
                        }
@@ -3281,7 +3281,7 @@ static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
                pnv_pci_ioda2_setup_dma_pe(phb, pe);
                break;
        default:
-               pr_warn("%s: No DMA for PHB#%d (type %d)\n",
+               pr_warn("%s: No DMA for PHB#%x (type %d)\n",
                        __func__, phb->hose->global_number, phb->type);
        }
 }
index db7b8020f68efb1ba07bfc3faf380e78f402e872..c6d554fe585c65601965f775c9720ae6f6b406e5 100644 (file)
@@ -234,7 +234,7 @@ static void pnv_pci_dump_p7ioc_diag_data(struct pci_controller *hose,
        int i;
 
        data = (struct OpalIoP7IOCPhbErrorData *)common;
-       pr_info("P7IOC PHB#%d Diag-data (Version: %d)\n",
+       pr_info("P7IOC PHB#%x Diag-data (Version: %d)\n",
                hose->global_number, be32_to_cpu(common->version));
 
        if (data->brdgCtl)
@@ -326,7 +326,7 @@ static void pnv_pci_dump_phb3_diag_data(struct pci_controller *hose,
        int i;
 
        data = (struct OpalIoPhb3ErrorData*)common;
-       pr_info("PHB3 PHB#%d Diag-data (Version: %d)\n",
+       pr_info("PHB3 PHB#%x Diag-data (Version: %d)\n",
                hose->global_number, be32_to_cpu(common->version));
        if (data->brdgCtl)
                pr_info("brdgCtl:     %08x\n",
@@ -516,7 +516,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
                }
        }
 
-       pr_devel(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n",
+       pr_devel(" -> EEH check, bdfn=%04x PE#%x fstate=%x\n",
                 (pdn->busno << 8) | (pdn->devfn), pe_no, fstate);
 
        /* Clear the frozen state if applicable */
index efe8b6bb168b97961f3943f7e2038a9faae2f702..d50c7d99baaf4c4c052cdd44c4245dd9243af304 100644 (file)
@@ -174,7 +174,7 @@ static void pnv_shutdown(void)
        opal_shutdown();
 }
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 static void pnv_kexec_wait_secondaries_down(void)
 {
        int my_cpu, i, notified = -1;
@@ -245,7 +245,7 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
                opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE);
        }
 }
-#endif /* CONFIG_KEXEC */
+#endif /* CONFIG_KEXEC_CORE */
 
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 static unsigned long pnv_memory_block_size(void)
@@ -311,7 +311,7 @@ define_machine(powernv) {
        .machine_shutdown       = pnv_shutdown,
        .power_save             = NULL,
        .calibrate_decr         = generic_calibrate_decr,
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        .kexec_cpu_down         = pnv_kexec_cpu_down,
 #endif
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
index cb3c50328de8758eea51258b556160b0be1b248a..cc2b281a3766c697e204479b3d9575e3d0dfb14b 100644 (file)
@@ -63,7 +63,7 @@ static long ps3_hpte_insert(unsigned long hpte_group, unsigned long vpn,
        vflags &= ~HPTE_V_SECONDARY;
 
        hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
-       hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize, ssize) | rflags;
+       hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize) | rflags;
 
        spin_lock_irqsave(&ps3_htab_lock, flags);
 
index 3a487e7f4a5e73e049a91780bdde0c38ed3de9ac..6244bc849469e33af7dcc5a4ac2b21dd970ac1c6 100644 (file)
@@ -250,7 +250,7 @@ static int __init ps3_probe(void)
        return 1;
 }
 
-#if defined(CONFIG_KEXEC)
+#if defined(CONFIG_KEXEC_CORE)
 static void ps3_kexec_cpu_down(int crash_shutdown, int secondary)
 {
        int cpu = smp_processor_id();
@@ -276,7 +276,7 @@ define_machine(ps3) {
        .progress                       = ps3_progress,
        .restart                        = ps3_restart,
        .halt                           = ps3_halt,
-#if defined(CONFIG_KEXEC)
+#if defined(CONFIG_KEXEC_CORE)
        .kexec_cpu_down                 = ps3_kexec_cpu_down,
 #endif
 };
index bec90fb3042548abd6a8e2b921aeff6c2730808f..e1c280a95d58ea8b1ce85ee6f1e7f41ec573c824 100644 (file)
@@ -127,3 +127,14 @@ config HV_PERF_CTRS
          systems. 24x7 is available on Power 8 systems.
 
           If unsure, select Y.
+
+config IBMVIO
+       depends on PPC_PSERIES
+       bool
+       default y
+
+config IBMEBUS
+       depends on PPC_PSERIES && !CPU_LITTLE_ENDIAN
+       bool "Support for GX bus based adapters"
+       help
+         Bus device driver for GX bus based adapters.
index fedc2ccf029d9f195f7a988872a2edeed9c36338..8f4ba089e80273f7d9cf6ce88032f329eaef64b8 100644 (file)
@@ -8,7 +8,7 @@ obj-y                   := lpar.o hvCall.o nvram.o reconfig.o \
                           pci.o pci_dlpar.o eeh_pseries.o msi.o
 obj-$(CONFIG_SMP)      += smp.o
 obj-$(CONFIG_SCANLOG)  += scanlog.o
-obj-$(CONFIG_KEXEC)    += kexec.o
+obj-$(CONFIG_KEXEC_CORE)       += kexec.o
 obj-$(CONFIG_PSERIES_ENERGY)   += pseries_energy.o
 
 obj-$(CONFIG_HOTPLUG_CPU)      += hotplug-cpu.o
@@ -21,6 +21,8 @@ obj-$(CONFIG_CMM)             += cmm.o
 obj-$(CONFIG_DTL)              += dtl.o
 obj-$(CONFIG_IO_EVENT_IRQ)     += io_event_irq.o
 obj-$(CONFIG_LPARCFG)          += lparcfg.o
+obj-$(CONFIG_IBMVIO)           += vio.o
+obj-$(CONFIG_IBMEBUS)          += ibmebus.o
 
 ifeq ($(CONFIG_PPC_PSERIES),y)
 obj-$(CONFIG_SUSPEND)          += suspend.o
index 423e450efe07cbf9d442f039f7148337251f561b..76caa4a45ccd8408a9701fd48cd589b0c07d297d 100644 (file)
@@ -418,84 +418,136 @@ void queue_hotplug_event(struct pseries_hp_errorlog *hp_errlog,
        }
 }
 
-static ssize_t dlpar_store(struct class *class, struct class_attribute *attr,
-                          const char *buf, size_t count)
+static int dlpar_parse_resource(char **cmd, struct pseries_hp_errorlog *hp_elog)
 {
-       struct pseries_hp_errorlog *hp_elog;
-       struct completion hotplug_done;
-       const char *arg;
-       int rc;
+       char *arg;
 
-       hp_elog = kzalloc(sizeof(*hp_elog), GFP_KERNEL);
-       if (!hp_elog) {
-               rc = -ENOMEM;
-               goto dlpar_store_out;
-       }
+       arg = strsep(cmd, " ");
+       if (!arg)
+               return -EINVAL;
 
-       /* Parse out the request from the user, this will be in the form
-        * <resource> <action> <id_type> <id>
-        */
-       arg = buf;
-       if (!strncmp(arg, "memory", 6)) {
+       if (sysfs_streq(arg, "memory")) {
                hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_MEM;
-               arg += strlen("memory ");
-       } else if (!strncmp(arg, "cpu", 3)) {
+       } else if (sysfs_streq(arg, "cpu")) {
                hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_CPU;
-               arg += strlen("cpu ");
        } else {
-               pr_err("Invalid resource specified: \"%s\"\n", buf);
-               rc = -EINVAL;
-               goto dlpar_store_out;
+               pr_err("Invalid resource specified.\n");
+               return -EINVAL;
        }
 
-       if (!strncmp(arg, "add", 3)) {
+       return 0;
+}
+
+static int dlpar_parse_action(char **cmd, struct pseries_hp_errorlog *hp_elog)
+{
+       char *arg;
+
+       arg = strsep(cmd, " ");
+       if (!arg)
+               return -EINVAL;
+
+       if (sysfs_streq(arg, "add")) {
                hp_elog->action = PSERIES_HP_ELOG_ACTION_ADD;
-               arg += strlen("add ");
-       } else if (!strncmp(arg, "remove", 6)) {
+       } else if (sysfs_streq(arg, "remove")) {
                hp_elog->action = PSERIES_HP_ELOG_ACTION_REMOVE;
-               arg += strlen("remove ");
        } else {
-               pr_err("Invalid action specified: \"%s\"\n", buf);
-               rc = -EINVAL;
-               goto dlpar_store_out;
+               pr_err("Invalid action specified.\n");
+               return -EINVAL;
        }
 
-       if (!strncmp(arg, "index", 5)) {
-               u32 index;
+       return 0;
+}
 
+static int dlpar_parse_id_type(char **cmd, struct pseries_hp_errorlog *hp_elog)
+{
+       char *arg;
+       u32 count, index;
+
+       arg = strsep(cmd, " ");
+       if (!arg)
+               return -EINVAL;
+
+       if (sysfs_streq(arg, "index")) {
                hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX;
-               arg += strlen("index ");
+               arg = strsep(cmd, " ");
+               if (!arg) {
+                       pr_err("No DRC Index specified.\n");
+                       return -EINVAL;
+               }
+
                if (kstrtou32(arg, 0, &index)) {
-                       rc = -EINVAL;
-                       pr_err("Invalid drc_index specified: \"%s\"\n", buf);
-                       goto dlpar_store_out;
+                       pr_err("Invalid DRC Index specified.\n");
+                       return -EINVAL;
                }
 
                hp_elog->_drc_u.drc_index = cpu_to_be32(index);
-       } else if (!strncmp(arg, "count", 5)) {
-               u32 count;
-
+       } else if (sysfs_streq(arg, "count")) {
                hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_COUNT;
-               arg += strlen("count ");
+               arg = strsep(cmd, " ");
+               if (!arg) {
+                       pr_err("No DRC count specified.\n");
+                       return -EINVAL;
+               }
+
                if (kstrtou32(arg, 0, &count)) {
-                       rc = -EINVAL;
-                       pr_err("Invalid count specified: \"%s\"\n", buf);
-                       goto dlpar_store_out;
+                       pr_err("Invalid DRC count specified.\n");
+                       return -EINVAL;
                }
 
                hp_elog->_drc_u.drc_count = cpu_to_be32(count);
        } else {
-               pr_err("Invalid id_type specified: \"%s\"\n", buf);
-               rc = -EINVAL;
-               goto dlpar_store_out;
+               pr_err("Invalid id_type specified.\n");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+static ssize_t dlpar_store(struct class *class, struct class_attribute *attr,
+                          const char *buf, size_t count)
+{
+       struct pseries_hp_errorlog *hp_elog;
+       struct completion hotplug_done;
+       char *argbuf;
+       char *args;
+       int rc;
+
+       args = argbuf = kstrdup(buf, GFP_KERNEL);
+       hp_elog = kzalloc(sizeof(*hp_elog), GFP_KERNEL);
+       if (!hp_elog || !argbuf) {
+               pr_info("Could not allocate resources for DLPAR operation\n");
+               kfree(argbuf);
+               kfree(hp_elog);
+               return -ENOMEM;
        }
 
+       /*
+        * Parse out the request from the user, this will be in the form:
+        * <resource> <action> <id_type> <id>
+        */
+       rc = dlpar_parse_resource(&args, hp_elog);
+       if (rc)
+               goto dlpar_store_out;
+
+       rc = dlpar_parse_action(&args, hp_elog);
+       if (rc)
+               goto dlpar_store_out;
+
+       rc = dlpar_parse_id_type(&args, hp_elog);
+       if (rc)
+               goto dlpar_store_out;
+
        init_completion(&hotplug_done);
        queue_hotplug_event(hp_elog, &hotplug_done, &rc);
        wait_for_completion(&hotplug_done);
 
 dlpar_store_out:
+       kfree(argbuf);
        kfree(hp_elog);
+
+       if (rc)
+               pr_err("Could not handle DLPAR request \"%s\"\n", buf);
+
        return rc ? rc : count;
 }
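
The rewritten dlpar_store() duplicates the user buffer with kstrdup() before parsing because strsep() writes NUL bytes into the string it walks and advances the caller's pointer, which is also why the parse helpers take a char **cmd. A stand-alone user-space sketch of tokenizing a "<resource> <action> <id_type> <id>" request the same way follows; the demo input string and the parse_field() helper are hypothetical and not part of the kernel code.

#define _DEFAULT_SOURCE         /* for strsep() with glibc */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Print one space-delimited token; strsep() NUL-terminates it in place
 * and advances *cmd past the delimiter, mirroring the helpers above. */
static int parse_field(char **cmd, const char *what)
{
        char *arg = strsep(cmd, " ");

        if (!arg || !*arg) {
                fprintf(stderr, "missing %s\n", what);
                return -1;
        }
        printf("%s = \"%s\"\n", what, arg);
        return 0;
}

int main(void)
{
        const char *buf = "memory add count 16";   /* what a user might echo to the sysfs file */
        char *argbuf = strdup(buf);                /* writable copy; kstrdup() in the kernel */
        char *args = argbuf;
        int rc = 0;

        if (!argbuf)
                return 1;

        rc |= parse_field(&args, "resource");
        rc |= parse_field(&args, "action");
        rc |= parse_field(&args, "id_type");
        rc |= parse_field(&args, "id");

        free(argbuf);
        return rc ? 1 : 0;
}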
 
index 1c428f06b14c8447729e1aca01954777ab2e90f0..1eef46d9cf304bc5b7a8457f13f6dd59e339d1dc 100644 (file)
@@ -270,7 +270,7 @@ static void *pseries_eeh_probe(struct pci_dn *pdn, void *data)
                        eeh_add_flag(EEH_ENABLED);
                        eeh_add_to_parent_pe(edev);
 
-                       pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%d-PE#%x\n",
+                       pr_debug("%s: EEH enabled on %02x:%02x.%01x PHB#%x-PE#%x\n",
                                __func__, pdn->busno, PCI_SLOT(pdn->devfn),
                                PCI_FUNC(pdn->devfn), pe.phb->global_number,
                                pe.addr);
@@ -371,7 +371,7 @@ static int pseries_eeh_get_pe_addr(struct eeh_pe *pe)
                                pe->config_addr, BUID_HI(pe->phb->buid),
                                BUID_LO(pe->phb->buid), 0);
                if (ret) {
-                       pr_warn("%s: Failed to get address for PHB#%d-PE#%x\n",
+                       pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n",
                                __func__, pe->phb->global_number, pe->config_addr);
                        return 0;
                }
@@ -384,7 +384,7 @@ static int pseries_eeh_get_pe_addr(struct eeh_pe *pe)
                                pe->config_addr, BUID_HI(pe->phb->buid),
                                BUID_LO(pe->phb->buid), 0);
                if (ret) {
-                       pr_warn("%s: Failed to get address for PHB#%d-PE#%x\n",
+                       pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n",
                                __func__, pe->phb->global_number, pe->config_addr);
                        return 0;
                }
@@ -653,7 +653,7 @@ static int pseries_eeh_configure_bridge(struct eeh_pe *pe)
                rtas_busy_delay(ret);
        }
 
-       pr_warn("%s: Unable to configure bridge PHB#%d-PE#%x (%d)\n",
+       pr_warn("%s: Unable to configure bridge PHB#%x-PE#%x (%d)\n",
                __func__, pe->phb->global_number, pe->addr, ret);
        return ret;
 }
index 76ec104e88beea0e89e3473d988e23fbdb7312c1..2617f9f356bde2f224fb9501b139b901e15ff11a 100644 (file)
@@ -472,12 +472,15 @@ static int dlpar_memory_remove_by_count(u32 lmbs_to_remove,
 
        /* Validate that there are enough LMBs to satisfy the request */
        for (i = 0; i < num_lmbs; i++) {
-               if (lmbs[i].flags & DRCONF_MEM_ASSIGNED)
+               if (lmb_is_removable(&lmbs[i]))
                        lmbs_available++;
        }
 
-       if (lmbs_available < lmbs_to_remove)
+       if (lmbs_available < lmbs_to_remove) {
+               pr_info("Not enough LMBs available (%d of %d) to satisfy request\n",
+                       lmbs_available, lmbs_to_remove);
                return -EINVAL;
+       }
 
        for (i = 0; i < num_lmbs && lmbs_removed < lmbs_to_remove; i++) {
                rc = dlpar_remove_lmb(&lmbs[i]);
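
The change above makes the pre-check count LMBs that lmb_is_removable() accepts rather than every DRCONF_MEM_ASSIGNED one, and reports the shortfall before any removal is attempted. A minimal stand-alone sketch of that validate-then-act pattern is below; the struct lmb fields and the predicate are invented stand-ins and do not reflect the kernel's real LMB representation.

#include <stdio.h>
#include <stdbool.h>

/* Invented stand-ins; the kernel's LMB records and removability check
 * consider much more than this. */
struct lmb {
        bool assigned;
        bool reserved;
};

static bool lmb_is_removable(const struct lmb *lmb)
{
        return lmb->assigned && !lmb->reserved;
}

static int remove_by_count(struct lmb *lmbs, int num_lmbs, int lmbs_to_remove)
{
        int i, lmbs_available = 0;

        /* Validate that there are enough removable LMBs before touching any */
        for (i = 0; i < num_lmbs; i++)
                if (lmb_is_removable(&lmbs[i]))
                        lmbs_available++;

        if (lmbs_available < lmbs_to_remove) {
                printf("Not enough LMBs available (%d of %d) to satisfy request\n",
                       lmbs_available, lmbs_to_remove);
                return -1;
        }

        /* ... the removal loop itself would run here ... */
        return 0;
}

int main(void)
{
        struct lmb lmbs[] = {
                { true, false }, { true, true }, { false, false }, { true, false },
        };

        return remove_by_count(lmbs, 4, 3) ? 1 : 0;
}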
diff --git a/arch/powerpc/platforms/pseries/ibmebus.c b/arch/powerpc/platforms/pseries/ibmebus.c
new file mode 100644 (file)
index 0000000..614c285
--- /dev/null
@@ -0,0 +1,469 @@
+/*
+ * IBM PowerPC IBM eBus Infrastructure Support.
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *  Joachim Fenkes <fenkes@de.ibm.com>
+ *  Heiko J Schick <schickhj@de.ibm.com>
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/init.h>
+#include <linux/export.h>
+#include <linux/console.h>
+#include <linux/kobject.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/of_platform.h>
+#include <asm/ibmebus.h>
+
+static struct device ibmebus_bus_device = { /* fake "parent" device */
+       .init_name = "ibmebus",
+};
+
+struct bus_type ibmebus_bus_type;
+
+/* These devices will automatically be added to the bus during init */
+static const struct of_device_id ibmebus_matches[] __initconst = {
+       { .compatible = "IBM,lhca" },
+       { .compatible = "IBM,lhea" },
+       {},
+};
+
+static void *ibmebus_alloc_coherent(struct device *dev,
+                                   size_t size,
+                                   dma_addr_t *dma_handle,
+                                   gfp_t flag,
+                                   unsigned long attrs)
+{
+       void *mem;
+
+       mem = kmalloc(size, flag);
+       *dma_handle = (dma_addr_t)mem;
+
+       return mem;
+}
+
+static void ibmebus_free_coherent(struct device *dev,
+                                 size_t size, void *vaddr,
+                                 dma_addr_t dma_handle,
+                                 unsigned long attrs)
+{
+       kfree(vaddr);
+}
+
+static dma_addr_t ibmebus_map_page(struct device *dev,
+                                  struct page *page,
+                                  unsigned long offset,
+                                  size_t size,
+                                  enum dma_data_direction direction,
+                                  unsigned long attrs)
+{
+       return (dma_addr_t)(page_address(page) + offset);
+}
+
+static void ibmebus_unmap_page(struct device *dev,
+                              dma_addr_t dma_addr,
+                              size_t size,
+                              enum dma_data_direction direction,
+                              unsigned long attrs)
+{
+       return;
+}
+
+static int ibmebus_map_sg(struct device *dev,
+                         struct scatterlist *sgl,
+                         int nents, enum dma_data_direction direction,
+                         unsigned long attrs)
+{
+       struct scatterlist *sg;
+       int i;
+
+       for_each_sg(sgl, sg, nents, i) {
+               sg->dma_address = (dma_addr_t) sg_virt(sg);
+               sg->dma_length = sg->length;
+       }
+
+       return nents;
+}
+
+static void ibmebus_unmap_sg(struct device *dev,
+                            struct scatterlist *sg,
+                            int nents, enum dma_data_direction direction,
+                            unsigned long attrs)
+{
+       return;
+}
+
+static int ibmebus_dma_supported(struct device *dev, u64 mask)
+{
+       return mask == DMA_BIT_MASK(64);
+}
+
+static u64 ibmebus_dma_get_required_mask(struct device *dev)
+{
+       return DMA_BIT_MASK(64);
+}
+
+static struct dma_map_ops ibmebus_dma_ops = {
+       .alloc              = ibmebus_alloc_coherent,
+       .free               = ibmebus_free_coherent,
+       .map_sg             = ibmebus_map_sg,
+       .unmap_sg           = ibmebus_unmap_sg,
+       .dma_supported      = ibmebus_dma_supported,
+       .get_required_mask  = ibmebus_dma_get_required_mask,
+       .map_page           = ibmebus_map_page,
+       .unmap_page         = ibmebus_unmap_page,
+};
+
+static int ibmebus_match_path(struct device *dev, void *data)
+{
+       struct device_node *dn = to_platform_device(dev)->dev.of_node;
+       return (dn->full_name &&
+               (strcasecmp((char *)data, dn->full_name) == 0));
+}
+
+static int ibmebus_match_node(struct device *dev, void *data)
+{
+       return to_platform_device(dev)->dev.of_node == data;
+}
+
+static int ibmebus_create_device(struct device_node *dn)
+{
+       struct platform_device *dev;
+       int ret;
+
+       dev = of_device_alloc(dn, NULL, &ibmebus_bus_device);
+       if (!dev)
+               return -ENOMEM;
+
+       dev->dev.bus = &ibmebus_bus_type;
+       dev->dev.archdata.dma_ops = &ibmebus_dma_ops;
+
+       ret = of_device_add(dev);
+       if (ret)
+               platform_device_put(dev);
+       return ret;
+}
+
+static int ibmebus_create_devices(const struct of_device_id *matches)
+{
+       struct device_node *root, *child;
+       struct device *dev;
+       int ret = 0;
+
+       root = of_find_node_by_path("/");
+
+       for_each_child_of_node(root, child) {
+               if (!of_match_node(matches, child))
+                       continue;
+
+               dev = bus_find_device(&ibmebus_bus_type, NULL, child,
+                                     ibmebus_match_node);
+               if (dev) {
+                       put_device(dev);
+                       continue;
+               }
+
+               ret = ibmebus_create_device(child);
+               if (ret) {
+                       printk(KERN_ERR "%s: failed to create device (%i)",
+                              __func__, ret);
+                       of_node_put(child);
+                       break;
+               }
+       }
+
+       of_node_put(root);
+       return ret;
+}
+
+int ibmebus_register_driver(struct platform_driver *drv)
+{
+       /* If the driver uses devices that ibmebus doesn't know, add them */
+       ibmebus_create_devices(drv->driver.of_match_table);
+
+       drv->driver.bus = &ibmebus_bus_type;
+       return driver_register(&drv->driver);
+}
+EXPORT_SYMBOL(ibmebus_register_driver);
+
+void ibmebus_unregister_driver(struct platform_driver *drv)
+{
+       driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL(ibmebus_unregister_driver);
+
+int ibmebus_request_irq(u32 ist, irq_handler_t handler,
+                       unsigned long irq_flags, const char *devname,
+                       void *dev_id)
+{
+       unsigned int irq = irq_create_mapping(NULL, ist);
+
+       if (!irq)
+               return -EINVAL;
+
+       return request_irq(irq, handler, irq_flags, devname, dev_id);
+}
+EXPORT_SYMBOL(ibmebus_request_irq);
+
+void ibmebus_free_irq(u32 ist, void *dev_id)
+{
+       unsigned int irq = irq_find_mapping(NULL, ist);
+
+       free_irq(irq, dev_id);
+       irq_dispose_mapping(irq);
+}
+EXPORT_SYMBOL(ibmebus_free_irq);
+
+static char *ibmebus_chomp(const char *in, size_t count)
+{
+       char *out = kmalloc(count + 1, GFP_KERNEL);
+
+       if (!out)
+               return NULL;
+
+       memcpy(out, in, count);
+       out[count] = '\0';
+       if (out[count - 1] == '\n')
+               out[count - 1] = '\0';
+
+       return out;
+}
+
+static ssize_t ibmebus_store_probe(struct bus_type *bus,
+                                  const char *buf, size_t count)
+{
+       struct device_node *dn = NULL;
+       struct device *dev;
+       char *path;
+       ssize_t rc = 0;
+
+       path = ibmebus_chomp(buf, count);
+       if (!path)
+               return -ENOMEM;
+
+       dev = bus_find_device(&ibmebus_bus_type, NULL, path,
+                             ibmebus_match_path);
+       if (dev) {
+               put_device(dev);
+               printk(KERN_WARNING "%s: %s has already been probed\n",
+                      __func__, path);
+               rc = -EEXIST;
+               goto out;
+       }
+
+       if ((dn = of_find_node_by_path(path))) {
+               rc = ibmebus_create_device(dn);
+               of_node_put(dn);
+       } else {
+               printk(KERN_WARNING "%s: no such device node: %s\n",
+                      __func__, path);
+               rc = -ENODEV;
+       }
+
+out:
+       kfree(path);
+       if (rc)
+               return rc;
+       return count;
+}
+static BUS_ATTR(probe, S_IWUSR, NULL, ibmebus_store_probe);
+
+static ssize_t ibmebus_store_remove(struct bus_type *bus,
+                                   const char *buf, size_t count)
+{
+       struct device *dev;
+       char *path;
+
+       path = ibmebus_chomp(buf, count);
+       if (!path)
+               return -ENOMEM;
+
+       if ((dev = bus_find_device(&ibmebus_bus_type, NULL, path,
+                                  ibmebus_match_path))) {
+               of_device_unregister(to_platform_device(dev));
+               put_device(dev);
+
+               kfree(path);
+               return count;
+       } else {
+               printk(KERN_WARNING "%s: %s not on the bus\n",
+                      __func__, path);
+
+               kfree(path);
+               return -ENODEV;
+       }
+}
+static BUS_ATTR(remove, S_IWUSR, NULL, ibmebus_store_remove);
+
+static struct attribute *ibmbus_bus_attrs[] = {
+       &bus_attr_probe.attr,
+       &bus_attr_remove.attr,
+       NULL,
+};
+ATTRIBUTE_GROUPS(ibmbus_bus);
+
+static int ibmebus_bus_bus_match(struct device *dev, struct device_driver *drv)
+{
+       const struct of_device_id *matches = drv->of_match_table;
+
+       if (!matches)
+               return 0;
+
+       return of_match_device(matches, dev) != NULL;
+}
+
+static int ibmebus_bus_device_probe(struct device *dev)
+{
+       int error = -ENODEV;
+       struct platform_driver *drv;
+       struct platform_device *of_dev;
+
+       drv = to_platform_driver(dev->driver);
+       of_dev = to_platform_device(dev);
+
+       if (!drv->probe)
+               return error;
+
+       of_dev_get(of_dev);
+
+       if (of_driver_match_device(dev, dev->driver))
+               error = drv->probe(of_dev);
+       if (error)
+               of_dev_put(of_dev);
+
+       return error;
+}
+
+static int ibmebus_bus_device_remove(struct device *dev)
+{
+       struct platform_device *of_dev = to_platform_device(dev);
+       struct platform_driver *drv = to_platform_driver(dev->driver);
+
+       if (dev->driver && drv->remove)
+               drv->remove(of_dev);
+       return 0;
+}
+
+static void ibmebus_bus_device_shutdown(struct device *dev)
+{
+       struct platform_device *of_dev = to_platform_device(dev);
+       struct platform_driver *drv = to_platform_driver(dev->driver);
+
+       if (dev->driver && drv->shutdown)
+               drv->shutdown(of_dev);
+}
+
+/*
+ * ibmebus_bus_device_attrs
+ */
+static ssize_t devspec_show(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       struct platform_device *ofdev;
+
+       ofdev = to_platform_device(dev);
+       return sprintf(buf, "%s\n", ofdev->dev.of_node->full_name);
+}
+
+static ssize_t name_show(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       struct platform_device *ofdev;
+
+       ofdev = to_platform_device(dev);
+       return sprintf(buf, "%s\n", ofdev->dev.of_node->name);
+}
+
+static ssize_t modalias_show(struct device *dev,
+                               struct device_attribute *attr, char *buf)
+{
+       ssize_t len = of_device_get_modalias(dev, buf, PAGE_SIZE - 2);
+       buf[len] = '\n';
+       buf[len+1] = 0;
+       return len+1;
+}
+
+static struct device_attribute ibmebus_bus_device_attrs[] = {
+       __ATTR_RO(devspec),
+       __ATTR_RO(name),
+       __ATTR_RO(modalias),
+       __ATTR_NULL
+};
+
+struct bus_type ibmebus_bus_type = {
+       .name      = "ibmebus",
+       .uevent    = of_device_uevent_modalias,
+       .bus_groups = ibmbus_bus_groups,
+       .match     = ibmebus_bus_bus_match,
+       .probe     = ibmebus_bus_device_probe,
+       .remove    = ibmebus_bus_device_remove,
+       .shutdown  = ibmebus_bus_device_shutdown,
+       .dev_attrs = ibmebus_bus_device_attrs,
+};
+EXPORT_SYMBOL(ibmebus_bus_type);
+
+static int __init ibmebus_bus_init(void)
+{
+       int err;
+
+       printk(KERN_INFO "IBM eBus Device Driver\n");
+
+       err = bus_register(&ibmebus_bus_type);
+       if (err) {
+               printk(KERN_ERR "%s: failed to register IBM eBus.\n",
+                      __func__);
+               return err;
+       }
+
+       err = device_register(&ibmebus_bus_device);
+       if (err) {
+               printk(KERN_WARNING "%s: device_register returned %i\n",
+                      __func__, err);
+               bus_unregister(&ibmebus_bus_type);
+
+               return err;
+       }
+
+       err = ibmebus_create_devices(ibmebus_matches);
+       if (err) {
+               device_unregister(&ibmebus_bus_device);
+               bus_unregister(&ibmebus_bus_type);
+               return err;
+       }
+
+       return 0;
+}
+postcore_initcall(ibmebus_bus_init);
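
To see how the relocated file is consumed: ibmebus_register_driver() first instantiates bus devices for any nodes matching the driver's of_match_table that are not yet on the bus, then points drv->driver.bus at ibmebus_bus_type and registers it. A hedged sketch of a hypothetical client module follows; the demo_* names and the "IBM,lhea" match entry are illustrative only, and the usual platform_driver probe/remove signatures of this kernel are assumed.

#include <linux/module.h>
#include <linux/of.h>
#include <linux/platform_device.h>
#include <asm/ibmebus.h>

static const struct of_device_id demo_ebus_match[] = {
        { .compatible = "IBM,lhea" },
        {},
};
MODULE_DEVICE_TABLE(of, demo_ebus_match);

static int demo_ebus_probe(struct platform_device *dev)
{
        dev_info(&dev->dev, "probed %s\n", dev->dev.of_node->full_name);
        return 0;
}

static int demo_ebus_remove(struct platform_device *dev)
{
        return 0;
}

static struct platform_driver demo_ebus_driver = {
        .driver = {
                .name           = "demo-ebus",
                .of_match_table = demo_ebus_match,
        },
        .probe  = demo_ebus_probe,
        .remove = demo_ebus_remove,
};

static int __init demo_ebus_init(void)
{
        /* creates missing bus devices for the match table, then registers */
        return ibmebus_register_driver(&demo_ebus_driver);
}
module_init(demo_ebus_init);

static void __exit demo_ebus_exit(void)
{
        ibmebus_unregister_driver(&demo_ebus_driver);
}
module_exit(demo_ebus_exit);

MODULE_LICENSE("GPL");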
index 24ad43afbb46bfba6a84e7bc1e924abda4478786..486e570c18699187a9cedf888d42709827055b14 100644 (file)
@@ -145,7 +145,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
                         hpte_group, vpn,  pa, rflags, vflags, psize);
 
        hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
-       hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags;
+       hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
 
        if (!(vflags & HPTE_V_BOLTED))
                pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
index 97aa3f332f240cb056c1d4bca4f59d4125d6e52b..7736352f7279b86a60ea727464082bc1a9aee8ce 100644 (file)
@@ -367,7 +367,7 @@ void pseries_disable_reloc_on_exc(void)
 }
 EXPORT_SYMBOL(pseries_disable_reloc_on_exc);
 
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
 static void pSeries_machine_kexec(struct kimage *image)
 {
        if (firmware_has_feature(FW_FEATURE_SET_MODE))
@@ -725,7 +725,7 @@ define_machine(pseries) {
        .progress               = rtas_progress,
        .system_reset_exception = pSeries_system_reset_exception,
        .machine_check_exception = pSeries_machine_check_exception,
-#ifdef CONFIG_KEXEC
+#ifdef CONFIG_KEXEC_CORE
        .machine_kexec          = pSeries_machine_kexec,
        .kexec_cpu_down         = pseries_kexec_cpu_down,
 #endif
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
new file mode 100644 (file)
index 0000000..2c8fb3e
--- /dev/null
@@ -0,0 +1,1705 @@
+/*
+ * IBM PowerPC Virtual I/O Infrastructure Support.
+ *
+ *    Copyright (c) 2003,2008 IBM Corp.
+ *     Dave Engebretsen engebret@us.ibm.com
+ *     Santiago Leon santil@us.ibm.com
+ *     Hollis Blanchard <hollisb@us.ibm.com>
+ *     Stephen Rothwell
+ *     Robert Jennings <rcjenn@us.ibm.com>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/cpu.h>
+#include <linux/types.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/console.h>
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/kobject.h>
+
+#include <asm/iommu.h>
+#include <asm/dma.h>
+#include <asm/vio.h>
+#include <asm/prom.h>
+#include <asm/firmware.h>
+#include <asm/tce.h>
+#include <asm/page.h>
+#include <asm/hvcall.h>
+
+static struct vio_dev vio_bus_device  = { /* fake "parent" device */
+       .name = "vio",
+       .type = "",
+       .dev.init_name = "vio",
+       .dev.bus = &vio_bus_type,
+};
+
+#ifdef CONFIG_PPC_SMLPAR
+/**
+ * vio_cmo_pool - A pool of IO memory for CMO use
+ *
+ * @size: The size of the pool in bytes
+ * @free: The amount of free memory in the pool
+ */
+struct vio_cmo_pool {
+       size_t size;
+       size_t free;
+};
+
+/* How many ms to delay queued balance work */
+#define VIO_CMO_BALANCE_DELAY 100
+
+/* Portion out IO memory to CMO devices by this chunk size */
+#define VIO_CMO_BALANCE_CHUNK 131072
+
+/**
+ * vio_cmo_dev_entry - A device that is CMO-enabled and requires entitlement
+ *
+ * @vio_dev: struct vio_dev pointer
+ * @list: pointer to other devices on bus that are being tracked
+ */
+struct vio_cmo_dev_entry {
+       struct vio_dev *viodev;
+       struct list_head list;
+};
+
+/**
+ * vio_cmo - VIO bus accounting structure for CMO entitlement
+ *
+ * @lock: spinlock for entire structure
+ * @balance_q: work queue for balancing system entitlement
+ * @device_list: list of CMO-enabled devices requiring entitlement
+ * @entitled: total system entitlement in bytes
+ * @reserve: pool of memory from which devices reserve entitlement, incl. spare
+ * @excess: pool of excess entitlement not needed for device reserves or spare
+ * @spare: IO memory for device hotplug functionality
+ * @min: minimum necessary for system operation
+ * @desired: desired memory for system operation
+ * @curr: bytes currently allocated
+ * @high: high water mark for IO data usage
+ */
+static struct vio_cmo {
+       spinlock_t lock;
+       struct delayed_work balance_q;
+       struct list_head device_list;
+       size_t entitled;
+       struct vio_cmo_pool reserve;
+       struct vio_cmo_pool excess;
+       size_t spare;
+       size_t min;
+       size_t desired;
+       size_t curr;
+       size_t high;
+} vio_cmo;
+
+/**
+ * vio_cmo_num_OF_devs - Count the number of OF devices that have DMA windows
+ */
+static int vio_cmo_num_OF_devs(void)
+{
+       struct device_node *node_vroot;
+       int count = 0;
+
+       /*
+        * Count the number of vdevice entries with an
+        * ibm,my-dma-window OF property
+        */
+       node_vroot = of_find_node_by_name(NULL, "vdevice");
+       if (node_vroot) {
+               struct device_node *of_node;
+               struct property *prop;
+
+               for_each_child_of_node(node_vroot, of_node) {
+                       prop = of_find_property(of_node, "ibm,my-dma-window",
+                                              NULL);
+                       if (prop)
+                               count++;
+               }
+       }
+       of_node_put(node_vroot);
+       return count;
+}
+
+/**
+ * vio_cmo_alloc - allocate IO memory for CMO-enabled devices
+ *
+ * @viodev: VIO device requesting IO memory
+ * @size: size of allocation requested
+ *
+ * Allocations come from memory reserved for the devices and any excess
+ * IO memory available to all devices.  The spare pool used to service
+ * hotplug must be equal to %VIO_CMO_MIN_ENT for the excess pool to be
+ * made available.
+ *
+ * Return codes:
+ *  0 for successful allocation and -ENOMEM for a failure
+ */
+static inline int vio_cmo_alloc(struct vio_dev *viodev, size_t size)
+{
+       unsigned long flags;
+       size_t reserve_free = 0;
+       size_t excess_free = 0;
+       int ret = -ENOMEM;
+
+       spin_lock_irqsave(&vio_cmo.lock, flags);
+
+       /* Determine the amount of free entitlement available in reserve */
+       if (viodev->cmo.entitled > viodev->cmo.allocated)
+               reserve_free = viodev->cmo.entitled - viodev->cmo.allocated;
+
+       /* If spare is not fulfilled, the excess pool can not be used. */
+       if (vio_cmo.spare >= VIO_CMO_MIN_ENT)
+               excess_free = vio_cmo.excess.free;
+
+       /* The request can be satisfied */
+       if ((reserve_free + excess_free) >= size) {
+               vio_cmo.curr += size;
+               if (vio_cmo.curr > vio_cmo.high)
+                       vio_cmo.high = vio_cmo.curr;
+               viodev->cmo.allocated += size;
+               size -= min(reserve_free, size);
+               vio_cmo.excess.free -= size;
+               ret = 0;
+       }
+
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+       return ret;
+}
+
+/**
+ * vio_cmo_dealloc - deallocate IO memory from CMO-enabled devices
+ * @viodev: VIO device freeing IO memory
+ * @size: size of deallocation
+ *
+ * IO memory is freed by the device back to the correct memory pools.
+ * The spare pool is replenished first from either memory pool, then
+ * the reserve pool is used to reduce device entitlement, the excess
+ * pool is used to increase the reserve pool toward the desired entitlement
+ * target, and then the remaining memory is returned to the pools.
+ *
+ */
+static inline void vio_cmo_dealloc(struct vio_dev *viodev, size_t size)
+{
+       unsigned long flags;
+       size_t spare_needed = 0;
+       size_t excess_freed = 0;
+       size_t reserve_freed = size;
+       size_t tmp;
+       int balance = 0;
+
+       spin_lock_irqsave(&vio_cmo.lock, flags);
+       vio_cmo.curr -= size;
+
+       /* Amount of memory freed from the excess pool */
+       if (viodev->cmo.allocated > viodev->cmo.entitled) {
+               excess_freed = min(reserve_freed, (viodev->cmo.allocated -
+                                                  viodev->cmo.entitled));
+               reserve_freed -= excess_freed;
+       }
+
+       /* Remove allocation from device */
+       viodev->cmo.allocated -= (reserve_freed + excess_freed);
+
+       /* Spare is a subset of the reserve pool, replenish it first. */
+       spare_needed = VIO_CMO_MIN_ENT - vio_cmo.spare;
+
+       /*
+        * Replenish the spare in the reserve pool from the excess pool.
+        * This moves entitlement into the reserve pool.
+        */
+       if (spare_needed && excess_freed) {
+               tmp = min(excess_freed, spare_needed);
+               vio_cmo.excess.size -= tmp;
+               vio_cmo.reserve.size += tmp;
+               vio_cmo.spare += tmp;
+               excess_freed -= tmp;
+               spare_needed -= tmp;
+               balance = 1;
+       }
+
+       /*
+        * Replenish the spare in the reserve pool from the reserve pool.
+        * This removes entitlement from the device down to VIO_CMO_MIN_ENT,
+        * if needed, and gives it to the spare pool. The amount of used
+        * memory in this pool does not change.
+        */
+       if (spare_needed && reserve_freed) {
+               tmp = min3(spare_needed, reserve_freed, (viodev->cmo.entitled - VIO_CMO_MIN_ENT));
+
+               vio_cmo.spare += tmp;
+               viodev->cmo.entitled -= tmp;
+               reserve_freed -= tmp;
+               spare_needed -= tmp;
+               balance = 1;
+       }
+
+       /*
+        * Increase the reserve pool until the desired allocation is met.
+        * Move an allocation freed from the excess pool into the reserve
+        * pool and schedule a balance operation.
+        */
+       if (excess_freed && (vio_cmo.desired > vio_cmo.reserve.size)) {
+               tmp = min(excess_freed, (vio_cmo.desired - vio_cmo.reserve.size));
+
+               vio_cmo.excess.size -= tmp;
+               vio_cmo.reserve.size += tmp;
+               excess_freed -= tmp;
+               balance = 1;
+       }
+
+       /* Return memory from the excess pool to that pool */
+       if (excess_freed)
+               vio_cmo.excess.free += excess_freed;
+
+       if (balance)
+               schedule_delayed_work(&vio_cmo.balance_q, VIO_CMO_BALANCE_DELAY);
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+/**
+ * vio_cmo_entitlement_update - Manage system entitlement changes
+ *
+ * @new_entitlement: new system entitlement to attempt to accommodate
+ *
+ * Increases in entitlement will be used to fulfill the spare entitlement
+ * and the rest is given to the excess pool.  Decreases, if they are
+ * possible, come from the excess pool and from unused device entitlement
+ *
+ * Returns: 0 on success, -ENOMEM when change can not be made
+ */
+int vio_cmo_entitlement_update(size_t new_entitlement)
+{
+       struct vio_dev *viodev;
+       struct vio_cmo_dev_entry *dev_ent;
+       unsigned long flags;
+       size_t avail, delta, tmp;
+
+       spin_lock_irqsave(&vio_cmo.lock, flags);
+
+       /* Entitlement increases */
+       if (new_entitlement > vio_cmo.entitled) {
+               delta = new_entitlement - vio_cmo.entitled;
+
+               /* Fulfill spare allocation */
+               if (vio_cmo.spare < VIO_CMO_MIN_ENT) {
+                       tmp = min(delta, (VIO_CMO_MIN_ENT - vio_cmo.spare));
+                       vio_cmo.spare += tmp;
+                       vio_cmo.reserve.size += tmp;
+                       delta -= tmp;
+               }
+
+               /* Remaining new allocation goes to the excess pool */
+               vio_cmo.entitled += delta;
+               vio_cmo.excess.size += delta;
+               vio_cmo.excess.free += delta;
+
+               goto out;
+       }
+
+       /* Entitlement decreases */
+       delta = vio_cmo.entitled - new_entitlement;
+       avail = vio_cmo.excess.free;
+
+       /*
+        * Need to check how much unused entitlement each device can
+        * sacrifice to fulfill entitlement change.
+        */
+       list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+               if (avail >= delta)
+                       break;
+
+               viodev = dev_ent->viodev;
+               if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
+                   (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
+                               avail += viodev->cmo.entitled -
+                                        max_t(size_t, viodev->cmo.allocated,
+                                              VIO_CMO_MIN_ENT);
+       }
+
+       if (delta <= avail) {
+               vio_cmo.entitled -= delta;
+
+               /* Take entitlement from the excess pool first */
+               tmp = min(vio_cmo.excess.free, delta);
+               vio_cmo.excess.size -= tmp;
+               vio_cmo.excess.free -= tmp;
+               delta -= tmp;
+
+               /*
+                * Remove all but VIO_CMO_MIN_ENT bytes from devices
+                * until entitlement change is served
+                */
+               list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+                       if (!delta)
+                               break;
+
+                       viodev = dev_ent->viodev;
+                       tmp = 0;
+                       if ((viodev->cmo.entitled > viodev->cmo.allocated) &&
+                           (viodev->cmo.entitled > VIO_CMO_MIN_ENT))
+                               tmp = viodev->cmo.entitled -
+                                     max_t(size_t, viodev->cmo.allocated,
+                                           VIO_CMO_MIN_ENT);
+                       viodev->cmo.entitled -= min(tmp, delta);
+                       delta -= min(tmp, delta);
+               }
+       } else {
+               spin_unlock_irqrestore(&vio_cmo.lock, flags);
+               return -ENOMEM;
+       }
+
+out:
+       schedule_delayed_work(&vio_cmo.balance_q, 0);
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+       return 0;
+}
+
+/**
+ * vio_cmo_balance - Balance entitlement among devices
+ *
+ * @work: work queue structure for this operation
+ *
+ * Any system entitlement above the minimum needed for devices, or
+ * already allocated to devices, can be distributed to the devices.
+ * The list of devices is iterated through to recalculate the desired
+ * entitlement level and to determine how much entitlement above the
+ * minimum entitlement is allocated to devices.
+ *
+ * Small chunks of the available entitlement are given to devices until
+ * their requirements are fulfilled or there is no entitlement left to give.
+ * Upon completion sizes of the reserve and excess pools are calculated.
+ *
+ * The system minimum entitlement level is also recalculated here.
+ * Entitlement will be reserved for devices even after vio_bus_remove to
+ * accommodate reloading the driver.  The OF tree is walked to count the
+ * number of devices present and this will remove entitlement for devices
+ * that have actually left the system after having vio_bus_remove called.
+ */
+static void vio_cmo_balance(struct work_struct *work)
+{
+       struct vio_cmo *cmo;
+       struct vio_dev *viodev;
+       struct vio_cmo_dev_entry *dev_ent;
+       unsigned long flags;
+       size_t avail = 0, level, chunk, need;
+       int devcount = 0, fulfilled;
+
+       cmo = container_of(work, struct vio_cmo, balance_q.work);
+
+       spin_lock_irqsave(&vio_cmo.lock, flags);
+
+       /* Calculate minimum entitlement and fulfill spare */
+       cmo->min = vio_cmo_num_OF_devs() * VIO_CMO_MIN_ENT;
+       BUG_ON(cmo->min > cmo->entitled);
+       cmo->spare = min_t(size_t, VIO_CMO_MIN_ENT, (cmo->entitled - cmo->min));
+       cmo->min += cmo->spare;
+       cmo->desired = cmo->min;
+
+       /*
+        * Determine how much entitlement is available and reset device
+        * entitlements
+        */
+       avail = cmo->entitled - cmo->spare;
+       list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+               viodev = dev_ent->viodev;
+               devcount++;
+               viodev->cmo.entitled = VIO_CMO_MIN_ENT;
+               cmo->desired += (viodev->cmo.desired - VIO_CMO_MIN_ENT);
+               avail -= max_t(size_t, viodev->cmo.allocated, VIO_CMO_MIN_ENT);
+       }
+
+       /*
+        * Having provided each device with the minimum entitlement, loop
+        * over the devices portioning out the remaining entitlement
+        * until there is nothing left.
+        */
+       level = VIO_CMO_MIN_ENT;
+       while (avail) {
+               fulfilled = 0;
+               list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+                       viodev = dev_ent->viodev;
+
+                       if (viodev->cmo.desired <= level) {
+                               fulfilled++;
+                               continue;
+                       }
+
+                       /*
+                        * Give the device up to VIO_CMO_BALANCE_CHUNK
+                        * bytes of entitlement, but do not exceed the
+                        * desired level of entitlement for the device.
+                        */
+                       chunk = min_t(size_t, avail, VIO_CMO_BALANCE_CHUNK);
+                       chunk = min(chunk, (viodev->cmo.desired -
+                                           viodev->cmo.entitled));
+                       viodev->cmo.entitled += chunk;
+
+                       /*
+                        * If the memory for this entitlement increase was
+                        * already allocated to the device it does not come
+                        * from the available pool being portioned out.
+                        */
+                       need = max(viodev->cmo.allocated, viodev->cmo.entitled)-
+                              max(viodev->cmo.allocated, level);
+                       avail -= need;
+
+               }
+               if (fulfilled == devcount)
+                       break;
+               level += VIO_CMO_BALANCE_CHUNK;
+       }
+
+       /* Calculate new reserve and excess pool sizes */
+       cmo->reserve.size = cmo->min;
+       cmo->excess.free = 0;
+       cmo->excess.size = 0;
+       need = 0;
+       list_for_each_entry(dev_ent, &vio_cmo.device_list, list) {
+               viodev = dev_ent->viodev;
+               /* Calculated reserve size above the minimum entitlement */
+               if (viodev->cmo.entitled)
+                       cmo->reserve.size += (viodev->cmo.entitled -
+                                             VIO_CMO_MIN_ENT);
+               /* Calculated used excess entitlement */
+               if (viodev->cmo.allocated > viodev->cmo.entitled)
+                       need += viodev->cmo.allocated - viodev->cmo.entitled;
+       }
+       cmo->excess.size = cmo->entitled - cmo->reserve.size;
+       cmo->excess.free = cmo->excess.size - need;
+
+       cancel_delayed_work(to_delayed_work(work));
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size,
+                                         dma_addr_t *dma_handle, gfp_t flag,
+                                         unsigned long attrs)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       void *ret;
+
+       if (vio_cmo_alloc(viodev, roundup(size, PAGE_SIZE))) {
+               atomic_inc(&viodev->cmo.allocs_failed);
+               return NULL;
+       }
+
+       ret = dma_iommu_ops.alloc(dev, size, dma_handle, flag, attrs);
+       if (unlikely(ret == NULL)) {
+               vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE));
+               atomic_inc(&viodev->cmo.allocs_failed);
+       }
+
+       return ret;
+}
+
+static void vio_dma_iommu_free_coherent(struct device *dev, size_t size,
+                                       void *vaddr, dma_addr_t dma_handle,
+                                       unsigned long attrs)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+
+       dma_iommu_ops.free(dev, size, vaddr, dma_handle, attrs);
+
+       vio_cmo_dealloc(viodev, roundup(size, PAGE_SIZE));
+}
+
+static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page,
+                                         unsigned long offset, size_t size,
+                                         enum dma_data_direction direction,
+                                         unsigned long attrs)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       struct iommu_table *tbl;
+       dma_addr_t ret = DMA_ERROR_CODE;
+
+       tbl = get_iommu_table_base(dev);
+       if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) {
+               atomic_inc(&viodev->cmo.allocs_failed);
+               return ret;
+       }
+
+       ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs);
+       if (unlikely(dma_mapping_error(dev, ret))) {
+               vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
+               atomic_inc(&viodev->cmo.allocs_failed);
+       }
+
+       return ret;
+}
+
+static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
+                                    size_t size,
+                                    enum dma_data_direction direction,
+                                    unsigned long attrs)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       struct iommu_table *tbl;
+
+       tbl = get_iommu_table_base(dev);
+       dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs);
+
+       vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
+}
+
+static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
+                                int nelems, enum dma_data_direction direction,
+                                unsigned long attrs)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       struct iommu_table *tbl;
+       struct scatterlist *sgl;
+       int ret, count;
+       size_t alloc_size = 0;
+
+       tbl = get_iommu_table_base(dev);
+       for_each_sg(sglist, sgl, nelems, count)
+               alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl));
+
+       if (vio_cmo_alloc(viodev, alloc_size)) {
+               atomic_inc(&viodev->cmo.allocs_failed);
+               return 0;
+       }
+
+       ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs);
+
+       if (unlikely(!ret)) {
+               vio_cmo_dealloc(viodev, alloc_size);
+               atomic_inc(&viodev->cmo.allocs_failed);
+               return ret;
+       }
+
+       for_each_sg(sglist, sgl, ret, count)
+               alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
+       if (alloc_size)
+               vio_cmo_dealloc(viodev, alloc_size);
+
+       return ret;
+}
+
+static void vio_dma_iommu_unmap_sg(struct device *dev,
+               struct scatterlist *sglist, int nelems,
+               enum dma_data_direction direction,
+               unsigned long attrs)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       struct iommu_table *tbl;
+       struct scatterlist *sgl;
+       size_t alloc_size = 0;
+       int count;
+
+       tbl = get_iommu_table_base(dev);
+       for_each_sg(sglist, sgl, nelems, count)
+               alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
+
+       dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
+
+       vio_cmo_dealloc(viodev, alloc_size);
+}
+
+static int vio_dma_iommu_dma_supported(struct device *dev, u64 mask)
+{
+        return dma_iommu_ops.dma_supported(dev, mask);
+}
+
+static u64 vio_dma_get_required_mask(struct device *dev)
+{
+        return dma_iommu_ops.get_required_mask(dev);
+}
+
+static struct dma_map_ops vio_dma_mapping_ops = {
+       .alloc             = vio_dma_iommu_alloc_coherent,
+       .free              = vio_dma_iommu_free_coherent,
+       .mmap              = dma_direct_mmap_coherent,
+       .map_sg            = vio_dma_iommu_map_sg,
+       .unmap_sg          = vio_dma_iommu_unmap_sg,
+       .map_page          = vio_dma_iommu_map_page,
+       .unmap_page        = vio_dma_iommu_unmap_page,
+       .dma_supported     = vio_dma_iommu_dma_supported,
+       .get_required_mask = vio_dma_get_required_mask,
+};
+
+/**
+ * vio_cmo_set_dev_desired - Set desired entitlement for a device
+ *
+ * @viodev: struct vio_dev for device to alter
+ * @desired: new desired entitlement level in bytes
+ *
+ * For use by devices to request a change to their entitlement at runtime or
+ * through sysfs.  The desired entitlement level is changed and a balancing
+ * of system resources is scheduled to run in the future.
+ */
+void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired)
+{
+       unsigned long flags;
+       struct vio_cmo_dev_entry *dev_ent;
+       int found = 0;
+
+       if (!firmware_has_feature(FW_FEATURE_CMO))
+               return;
+
+       spin_lock_irqsave(&vio_cmo.lock, flags);
+       if (desired < VIO_CMO_MIN_ENT)
+               desired = VIO_CMO_MIN_ENT;
+
+       /*
+        * Changes will not be made for devices not in the device list.
+        * If it is not in the device list, then no driver is loaded
+        * for the device and it can not receive entitlement.
+        */
+       list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+               if (viodev == dev_ent->viodev) {
+                       found = 1;
+                       break;
+               }
+       if (!found) {
+               spin_unlock_irqrestore(&vio_cmo.lock, flags);
+               return;
+       }
+
+       /* Increase/decrease in desired device entitlement */
+       if (desired >= viodev->cmo.desired) {
+               /* Just bump the bus and device values prior to a balance */
+               vio_cmo.desired += desired - viodev->cmo.desired;
+               viodev->cmo.desired = desired;
+       } else {
+               /* Decrease bus and device values for desired entitlement */
+               vio_cmo.desired -= viodev->cmo.desired - desired;
+               viodev->cmo.desired = desired;
+               /*
+                * If less entitlement is desired than current entitlement, move
+                * any reserve memory in the change region to the excess pool.
+                */
+               if (viodev->cmo.entitled > desired) {
+                       vio_cmo.reserve.size -= viodev->cmo.entitled - desired;
+                       vio_cmo.excess.size += viodev->cmo.entitled - desired;
+                       /*
+                        * If entitlement moving from the reserve pool to the
+                        * excess pool is currently unused, add to the excess
+                        * free counter.
+                        */
+                       if (viodev->cmo.allocated < viodev->cmo.entitled)
+                               vio_cmo.excess.free += viodev->cmo.entitled -
+                                                      max(viodev->cmo.allocated, desired);
+                       viodev->cmo.entitled = desired;
+               }
+       }
+       schedule_delayed_work(&vio_cmo.balance_q, 0);
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+/**
+ * vio_cmo_bus_probe - Handle CMO specific bus probe activities
+ *
+ * @viodev - Pointer to struct vio_dev for device
+ *
+ * Determine the device's IO memory entitlement needs, attempting
+ * to satisfy the system minimum entitlement at first and scheduling
+ * a balance operation to take care of the rest at a later time.
+ *
+ * Returns: 0 on success, -EINVAL when device doesn't support CMO, and
+ *          -ENOMEM when entitlement is not available for device or
+ *          device entry.
+ *
+ */
+static int vio_cmo_bus_probe(struct vio_dev *viodev)
+{
+       struct vio_cmo_dev_entry *dev_ent;
+       struct device *dev = &viodev->dev;
+       struct iommu_table *tbl;
+       struct vio_driver *viodrv = to_vio_driver(dev->driver);
+       unsigned long flags;
+       size_t size;
+       bool dma_capable = false;
+
+       tbl = get_iommu_table_base(dev);
+
+       /* A device requires entitlement if it has a DMA window property */
+       switch (viodev->family) {
+       case VDEVICE:
+               if (of_get_property(viodev->dev.of_node,
+                                       "ibm,my-dma-window", NULL))
+                       dma_capable = true;
+               break;
+       case PFO:
+               dma_capable = false;
+               break;
+       default:
+               dev_warn(dev, "unknown device family: %d\n", viodev->family);
+               BUG();
+               break;
+       }
+
+       /* Configure entitlement for the device. */
+       if (dma_capable) {
+               /* Check that the driver is CMO enabled and get desired DMA */
+               if (!viodrv->get_desired_dma) {
+                       dev_err(dev, "%s: device driver does not support CMO\n",
+                               __func__);
+                       return -EINVAL;
+               }
+
+               viodev->cmo.desired =
+                       IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev), tbl);
+               if (viodev->cmo.desired < VIO_CMO_MIN_ENT)
+                       viodev->cmo.desired = VIO_CMO_MIN_ENT;
+               size = VIO_CMO_MIN_ENT;
+
+               dev_ent = kmalloc(sizeof(struct vio_cmo_dev_entry),
+                                 GFP_KERNEL);
+               if (!dev_ent)
+                       return -ENOMEM;
+
+               dev_ent->viodev = viodev;
+               spin_lock_irqsave(&vio_cmo.lock, flags);
+               list_add(&dev_ent->list, &vio_cmo.device_list);
+       } else {
+               viodev->cmo.desired = 0;
+               size = 0;
+               spin_lock_irqsave(&vio_cmo.lock, flags);
+       }
+
+       /*
+        * If the needs for vio_cmo.min have not changed since they
+        * were last set, the number of devices in the OF tree has
+        * been constant and the IO memory for this is already in
+        * the reserve pool.
+        */
+       if (vio_cmo.min == ((vio_cmo_num_OF_devs() + 1) *
+                           VIO_CMO_MIN_ENT)) {
+               /* Update desired entitlement if device requires it */
+               if (size)
+                       vio_cmo.desired += (viodev->cmo.desired -
+                                       VIO_CMO_MIN_ENT);
+       } else {
+               size_t tmp;
+
+               tmp = vio_cmo.spare + vio_cmo.excess.free;
+               if (tmp < size) {
+                       dev_err(dev, "%s: insufficient free "
+                               "entitlement to add device. "
+                               "Need %lu, have %lu\n", __func__,
+                               size, (vio_cmo.spare + tmp));
+                       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+                       return -ENOMEM;
+               }
+
+               /* Use excess pool first to fulfill request */
+               tmp = min(size, vio_cmo.excess.free);
+               vio_cmo.excess.free -= tmp;
+               vio_cmo.excess.size -= tmp;
+               vio_cmo.reserve.size += tmp;
+
+               /* Use spare if excess pool was insufficient */
+               vio_cmo.spare -= size - tmp;
+
+               /* Update bus accounting */
+               vio_cmo.min += size;
+               vio_cmo.desired += viodev->cmo.desired;
+       }
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+       return 0;
+}
+
+/**
+ * vio_cmo_bus_remove - Handle CMO specific bus removal activities
+ *
+ * @viodev - Pointer to struct vio_dev for device
+ *
+ * Remove the device from the cmo device list.  The minimum entitlement
+ * will be reserved for the device as long as it is in the system.  The
+ * rest of the entitlement the device had been allocated will be returned
+ * to the system.
+ */
+static void vio_cmo_bus_remove(struct vio_dev *viodev)
+{
+       struct vio_cmo_dev_entry *dev_ent;
+       unsigned long flags;
+       size_t tmp;
+
+       spin_lock_irqsave(&vio_cmo.lock, flags);
+       if (viodev->cmo.allocated) {
+               dev_err(&viodev->dev, "%s: device had %lu bytes of IO "
+                       "allocated after remove operation.\n",
+                       __func__, viodev->cmo.allocated);
+               BUG();
+       }
+
+       /*
+        * Remove the device from the device list being maintained for
+        * CMO enabled devices.
+        */
+       list_for_each_entry(dev_ent, &vio_cmo.device_list, list)
+               if (viodev == dev_ent->viodev) {
+                       list_del(&dev_ent->list);
+                       kfree(dev_ent);
+                       break;
+               }
+
+       /*
+        * Devices may not require any entitlement and they do not need
+        * to be processed.  Otherwise, return the device's entitlement
+        * back to the pools.
+        */
+       if (viodev->cmo.entitled) {
+               /*
+                * This device has not yet left the OF tree, it's
+                * minimum entitlement remains in vio_cmo.min and
+                * vio_cmo.desired
+                */
+               vio_cmo.desired -= (viodev->cmo.desired - VIO_CMO_MIN_ENT);
+
+               /*
+                * Keep the device's minimum allocation in the reserve
+                * pool for as long as it exists in the OF tree, as
+                * determined by a later balance operation.
+                */
+               viodev->cmo.entitled -= VIO_CMO_MIN_ENT;
+
+               /* Replenish spare from freed reserve pool */
+               if (viodev->cmo.entitled && (vio_cmo.spare < VIO_CMO_MIN_ENT)) {
+                       tmp = min(viodev->cmo.entitled, (VIO_CMO_MIN_ENT -
+                                                        vio_cmo.spare));
+                       vio_cmo.spare += tmp;
+                       viodev->cmo.entitled -= tmp;
+               }
+
+               /* Remaining reserve goes to excess pool */
+               vio_cmo.excess.size += viodev->cmo.entitled;
+               vio_cmo.excess.free += viodev->cmo.entitled;
+               vio_cmo.reserve.size -= viodev->cmo.entitled;
+
+               /*
+                * Until the device is removed it keeps a minimum
+                * entitlement; this guarantees that a module
+                * unload/reload cycle will succeed.
+                */
+               viodev->cmo.entitled = VIO_CMO_MIN_ENT;
+               viodev->cmo.desired = VIO_CMO_MIN_ENT;
+               atomic_set(&viodev->cmo.allocs_failed, 0);
+       }
+
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+}
+
+static void vio_cmo_set_dma_ops(struct vio_dev *viodev)
+{
+       set_dma_ops(&viodev->dev, &vio_dma_mapping_ops);
+}
+
+/**
+ * vio_cmo_bus_init - CMO entitlement initialization at bus init time
+ *
+ * Set up the reserve and excess entitlement pools based on available
+ * system entitlement and the number of devices in the OF tree that
+ * require entitlement in the reserve pool.
+ */
+static void vio_cmo_bus_init(void)
+{
+       struct hvcall_mpp_data mpp_data;
+       int err;
+
+       memset(&vio_cmo, 0, sizeof(struct vio_cmo));
+       spin_lock_init(&vio_cmo.lock);
+       INIT_LIST_HEAD(&vio_cmo.device_list);
+       INIT_DELAYED_WORK(&vio_cmo.balance_q, vio_cmo_balance);
+
+       /* Get current system entitlement */
+       err = h_get_mpp(&mpp_data);
+
+       /*
+        * On failure, continue with the entitlement set to 0; we will
+        * panic() later when the spare is reserved.
+        */
+       if (err != H_SUCCESS) {
+               printk(KERN_ERR "%s: unable to determine system IO "
+                      "entitlement. (%d)\n", __func__, err);
+               vio_cmo.entitled = 0;
+       } else {
+               vio_cmo.entitled = mpp_data.entitled_mem;
+       }
+
+       /* Set reservation and check against entitlement */
+       vio_cmo.spare = VIO_CMO_MIN_ENT;
+       vio_cmo.reserve.size = vio_cmo.spare;
+       vio_cmo.reserve.size += (vio_cmo_num_OF_devs() *
+                                VIO_CMO_MIN_ENT);
+       if (vio_cmo.reserve.size > vio_cmo.entitled) {
+               printk(KERN_ERR "%s: insufficient system entitlement\n",
+                      __func__);
+               panic("%s: Insufficient system entitlement", __func__);
+       }
+
+       /* Set the remaining accounting variables */
+       vio_cmo.excess.size = vio_cmo.entitled - vio_cmo.reserve.size;
+       vio_cmo.excess.free = vio_cmo.excess.size;
+       vio_cmo.min = vio_cmo.reserve.size;
+       vio_cmo.desired = vio_cmo.reserve.size;
+}
+
+/* sysfs device functions and data structures for CMO */
+
+#define viodev_cmo_rd_attr(name)                                        \
+static ssize_t viodev_cmo_##name##_show(struct device *dev,             \
+                                        struct device_attribute *attr,  \
+                                         char *buf)                     \
+{                                                                       \
+       return sprintf(buf, "%lu\n", to_vio_dev(dev)->cmo.name);        \
+}
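For illustration, this is roughly what a single invocation of the macro above, e.g. viodev_cmo_rd_attr(desired) used a few lines below, would expand to; this is a sketch inferred from the macro body, not generated output:

static ssize_t viodev_cmo_desired_show(struct device *dev,
                                        struct device_attribute *attr,
                                        char *buf)
{
       /* Print the per-device CMO "desired" value as an unsigned long */
       return sprintf(buf, "%lu\n", to_vio_dev(dev)->cmo.desired);
}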
+
+static ssize_t viodev_cmo_allocs_failed_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       return sprintf(buf, "%d\n", atomic_read(&viodev->cmo.allocs_failed));
+}
+
+static ssize_t viodev_cmo_allocs_failed_reset(struct device *dev,
+               struct device_attribute *attr, const char *buf, size_t count)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       atomic_set(&viodev->cmo.allocs_failed, 0);
+       return count;
+}
+
+static ssize_t viodev_cmo_desired_set(struct device *dev,
+               struct device_attribute *attr, const char *buf, size_t count)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       size_t new_desired;
+       int ret;
+
+       ret = kstrtoul(buf, 10, &new_desired);
+       if (ret)
+               return ret;
+
+       vio_cmo_set_dev_desired(viodev, new_desired);
+       return count;
+}
+
+viodev_cmo_rd_attr(desired);
+viodev_cmo_rd_attr(entitled);
+viodev_cmo_rd_attr(allocated);
+
+static ssize_t name_show(struct device *, struct device_attribute *, char *);
+static ssize_t devspec_show(struct device *, struct device_attribute *, char *);
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+                            char *buf);
+static struct device_attribute vio_cmo_dev_attrs[] = {
+       __ATTR_RO(name),
+       __ATTR_RO(devspec),
+       __ATTR_RO(modalias),
+       __ATTR(cmo_desired,       S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+              viodev_cmo_desired_show, viodev_cmo_desired_set),
+       __ATTR(cmo_entitled,      S_IRUGO, viodev_cmo_entitled_show,      NULL),
+       __ATTR(cmo_allocated,     S_IRUGO, viodev_cmo_allocated_show,     NULL),
+       __ATTR(cmo_allocs_failed, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH,
+              viodev_cmo_allocs_failed_show, viodev_cmo_allocs_failed_reset),
+       __ATTR_NULL
+};
+
+/* sysfs bus functions and data structures for CMO */
+
+#define viobus_cmo_rd_attr(name)                                        \
+static ssize_t cmo_##name##_show(struct bus_type *bt, char *buf)        \
+{                                                                       \
+       return sprintf(buf, "%lu\n", vio_cmo.name);                     \
+}                                                                       \
+static BUS_ATTR_RO(cmo_##name)
+
+#define viobus_cmo_pool_rd_attr(name, var)                              \
+static ssize_t                                                          \
+cmo_##name##_##var##_show(struct bus_type *bt, char *buf)               \
+{                                                                       \
+       return sprintf(buf, "%lu\n", vio_cmo.name.var);                 \
+}                                                                       \
+static BUS_ATTR_RO(cmo_##name##_##var)
+
+viobus_cmo_rd_attr(entitled);
+viobus_cmo_rd_attr(spare);
+viobus_cmo_rd_attr(min);
+viobus_cmo_rd_attr(desired);
+viobus_cmo_rd_attr(curr);
+viobus_cmo_pool_rd_attr(reserve, size);
+viobus_cmo_pool_rd_attr(excess, size);
+viobus_cmo_pool_rd_attr(excess, free);
+
+static ssize_t cmo_high_show(struct bus_type *bt, char *buf)
+{
+       return sprintf(buf, "%lu\n", vio_cmo.high);
+}
+
+static ssize_t cmo_high_store(struct bus_type *bt, const char *buf,
+                             size_t count)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&vio_cmo.lock, flags);
+       vio_cmo.high = vio_cmo.curr;
+       spin_unlock_irqrestore(&vio_cmo.lock, flags);
+
+       return count;
+}
+static BUS_ATTR_RW(cmo_high);
+
+static struct attribute *vio_bus_attrs[] = {
+       &bus_attr_cmo_entitled.attr,
+       &bus_attr_cmo_spare.attr,
+       &bus_attr_cmo_min.attr,
+       &bus_attr_cmo_desired.attr,
+       &bus_attr_cmo_curr.attr,
+       &bus_attr_cmo_high.attr,
+       &bus_attr_cmo_reserve_size.attr,
+       &bus_attr_cmo_excess_size.attr,
+       &bus_attr_cmo_excess_free.attr,
+       NULL,
+};
+ATTRIBUTE_GROUPS(vio_bus);
+
+static void vio_cmo_sysfs_init(void)
+{
+       vio_bus_type.dev_attrs = vio_cmo_dev_attrs;
+       vio_bus_type.bus_groups = vio_bus_groups;
+}
+#else /* CONFIG_PPC_SMLPAR */
+int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; }
+void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {}
+static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; }
+static void vio_cmo_bus_remove(struct vio_dev *viodev) {}
+static void vio_cmo_set_dma_ops(struct vio_dev *viodev) {}
+static void vio_cmo_bus_init(void) {}
+static void vio_cmo_sysfs_init(void) { }
+#endif /* CONFIG_PPC_SMLPAR */
+EXPORT_SYMBOL(vio_cmo_entitlement_update);
+EXPORT_SYMBOL(vio_cmo_set_dev_desired);
+
+
+/*
+ * Platform Facilities Option (PFO) support
+ */
+
+/**
+ * vio_h_cop_sync - Perform a synchronous PFO co-processor operation
+ *
+ * @vdev - Pointer to a struct vio_dev for device
+ * @op - Pointer to a struct vio_pfo_op for the operation parameters
+ *
+ * Calls the hypervisor to synchronously perform the PFO operation
+ * described in @op.  In the case of a busy response from the hypervisor,
+ * the operation is re-submitted indefinitely unless a non-zero timeout is
+ * specified or an error occurs.  The timeout places a limit on when to stop
+ * re-submitting an operation; the total time can be exceeded if an
+ * operation is in progress.
+ *
+ * op->hcall_err is set to the return value of the last H_COP hcall, or to
+ * 0 if an error not involving the hcall was encountered.
+ *
+ * Returns:
+ *     0 on success,
+ *     -EINVAL if the h_call fails due to an invalid parameter,
+ *     -E2BIG if the h_call can not be performed synchronously,
+ *     -EBUSY if a timeout is specified and has elapsed,
+ *     -EACCES if the memory area for data/status has been rescinded, or
+ *     -EPERM if a hardware fault has been indicated
+ */
+int vio_h_cop_sync(struct vio_dev *vdev, struct vio_pfo_op *op)
+{
+       struct device *dev = &vdev->dev;
+       unsigned long deadline = 0;
+       long hret = 0;
+       int ret = 0;
+
+       if (op->timeout)
+               deadline = jiffies + msecs_to_jiffies(op->timeout);
+
+       while (true) {
+               hret = plpar_hcall_norets(H_COP, op->flags,
+                               vdev->resource_id,
+                               op->in, op->inlen, op->out,
+                               op->outlen, op->csbcpb);
+
+               if (hret == H_SUCCESS ||
+                   (hret != H_NOT_ENOUGH_RESOURCES &&
+                    hret != H_BUSY && hret != H_RESOURCE) ||
+                   (op->timeout && time_after(deadline, jiffies)))
+                       break;
+
+               dev_dbg(dev, "%s: hcall ret(%ld), retrying.\n", __func__, hret);
+       }
+
+       switch (hret) {
+       case H_SUCCESS:
+               ret = 0;
+               break;
+       case H_OP_MODE:
+       case H_TOO_BIG:
+               ret = -E2BIG;
+               break;
+       case H_RESCINDED:
+               ret = -EACCES;
+               break;
+       case H_HARDWARE:
+               ret = -EPERM;
+               break;
+       case H_NOT_ENOUGH_RESOURCES:
+       case H_RESOURCE:
+       case H_BUSY:
+               ret = -EBUSY;
+               break;
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       if (ret)
+               dev_dbg(dev, "%s: Sync h_cop_op failure (ret:%d) (hret:%ld)\n",
+                               __func__, ret, hret);
+
+       op->hcall_err = hret;
+       return ret;
+}
+EXPORT_SYMBOL(vio_h_cop_sync);
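For illustration, a minimal, hedged sketch of how a PFO driver might drive vio_h_cop_sync(). The helper name and the DMA addresses are hypothetical; only the struct vio_pfo_op members referenced by the function above (flags, in/inlen, out/outlen, csbcpb, timeout, hcall_err) are assumed, and real callers would also build a CSB/CPB describing the request:

/* Hypothetical caller: submit one synchronous co-processor request. */
static int example_pfo_request(struct vio_dev *vdev,
                               u64 in, u64 inlen, u64 out, u64 outlen,
                               u64 csbcpb)
{
       struct vio_pfo_op op = {
               .flags   = 0,            /* operation-specific H_COP flags */
               .in      = in,           /* DMA address of the input buffer */
               .inlen   = inlen,
               .out     = out,          /* DMA address of the output buffer */
               .outlen  = outlen,
               .csbcpb  = csbcpb,       /* control/status block pair */
               .timeout = 1000,         /* stop retrying busy replies after ~1s */
       };
       int rc;

       rc = vio_h_cop_sync(vdev, &op);
       if (rc)
               dev_err(&vdev->dev, "PFO request failed: %d (hcall %ld)\n",
                       rc, op.hcall_err);
       return rc;
}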
+
+static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
+{
+       const __be32 *dma_window;
+       struct iommu_table *tbl;
+       unsigned long offset, size;
+
+       dma_window = of_get_property(dev->dev.of_node,
+                                 "ibm,my-dma-window", NULL);
+       if (!dma_window)
+               return NULL;
+
+       tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
+       if (tbl == NULL)
+               return NULL;
+
+       of_parse_dma_window(dev->dev.of_node, dma_window,
+                           &tbl->it_index, &offset, &size);
+
+       /* TCE table size - measured in tce entries */
+       tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
+       tbl->it_size = size >> tbl->it_page_shift;
+       /* offset for VIO should always be 0 */
+       tbl->it_offset = offset >> tbl->it_page_shift;
+       tbl->it_busno = 0;
+       tbl->it_type = TCE_VB;
+       tbl->it_blocksize = 16;
+
+       if (firmware_has_feature(FW_FEATURE_LPAR))
+               tbl->it_ops = &iommu_table_lpar_multi_ops;
+       else
+               tbl->it_ops = &iommu_table_pseries_ops;
+
+       return iommu_init_table(tbl, -1);
+}
+
+/**
+ * vio_match_device: - Tell if a VIO device has a matching
+ *                     VIO device id structure.
+ * @ids:       array of VIO device id structures to search in
+ * @dev:       the VIO device structure to match against
+ *
+ * Used by a driver to check whether a VIO device present in the
+ * system is in its list of supported devices. Returns the matching
+ * vio_device_id structure or NULL if there is no match.
+ */
+static const struct vio_device_id *vio_match_device(
+               const struct vio_device_id *ids, const struct vio_dev *dev)
+{
+       while (ids->type[0] != '\0') {
+               if ((strncmp(dev->type, ids->type, strlen(ids->type)) == 0) &&
+                   of_device_is_compatible(dev->dev.of_node,
+                                        ids->compat))
+                       return ids;
+               ids++;
+       }
+       return NULL;
+}
+
+/*
+ * Convert from struct device to struct vio_dev and pass to driver.
+ * dev->driver has already been set by generic code because vio_bus_match
+ * succeeded.
+ */
+static int vio_bus_probe(struct device *dev)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       struct vio_driver *viodrv = to_vio_driver(dev->driver);
+       const struct vio_device_id *id;
+       int error = -ENODEV;
+
+       if (!viodrv->probe)
+               return error;
+
+       id = vio_match_device(viodrv->id_table, viodev);
+       if (id) {
+               memset(&viodev->cmo, 0, sizeof(viodev->cmo));
+               if (firmware_has_feature(FW_FEATURE_CMO)) {
+                       error = vio_cmo_bus_probe(viodev);
+                       if (error)
+                               return error;
+               }
+               error = viodrv->probe(viodev, id);
+               if (error && firmware_has_feature(FW_FEATURE_CMO))
+                       vio_cmo_bus_remove(viodev);
+       }
+
+       return error;
+}
+
+/* convert from struct device to struct vio_dev and pass to driver. */
+static int vio_bus_remove(struct device *dev)
+{
+       struct vio_dev *viodev = to_vio_dev(dev);
+       struct vio_driver *viodrv = to_vio_driver(dev->driver);
+       struct device *devptr;
+       int ret = 1;
+
+       /*
+        * Hold a reference to the device after the remove function is called
+        * to allow for CMO accounting cleanup for the device.
+        */
+       devptr = get_device(dev);
+
+       if (viodrv->remove)
+               ret = viodrv->remove(viodev);
+
+       if (!ret && firmware_has_feature(FW_FEATURE_CMO))
+               vio_cmo_bus_remove(viodev);
+
+       put_device(devptr);
+       return ret;
+}
+
+/**
+ * vio_register_driver: - Register a new vio driver
+ * @viodrv:    The vio_driver structure to be registered.
+ */
+int __vio_register_driver(struct vio_driver *viodrv, struct module *owner,
+                         const char *mod_name)
+{
+       pr_debug("%s: driver %s registering\n", __func__, viodrv->name);
+
+       /* fill in 'struct driver' fields */
+       viodrv->driver.name = viodrv->name;
+       viodrv->driver.pm = viodrv->pm;
+       viodrv->driver.bus = &vio_bus_type;
+       viodrv->driver.owner = owner;
+       viodrv->driver.mod_name = mod_name;
+
+       return driver_register(&viodrv->driver);
+}
+EXPORT_SYMBOL(__vio_register_driver);
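For illustration, a hedged sketch of a client driver registering with this bus. The device type/compat strings and callbacks are invented, and vio_register_driver() is assumed to be the usual wrapper that supplies THIS_MODULE and KBUILD_MODNAME to __vio_register_driver() above:

#include <linux/module.h>
#include <asm/vio.h>

/* Hypothetical type/compat pair; real drivers match firmware device nodes. */
static const struct vio_device_id example_vio_ids[] = {
       { "network", "IBM,example-adapter" },
       { "", "" },
};
MODULE_DEVICE_TABLE(vio, example_vio_ids);

static int example_probe(struct vio_dev *vdev, const struct vio_device_id *id)
{
       dev_info(&vdev->dev, "bound, unit address 0x%x\n", vdev->unit_address);
       return 0;
}

static int example_remove(struct vio_dev *vdev)
{
       return 0;
}

static struct vio_driver example_vio_driver = {
       .name     = "example_vio",
       .id_table = example_vio_ids,
       .probe    = example_probe,
       .remove   = example_remove,
};

static int __init example_vio_init(void)
{
       return vio_register_driver(&example_vio_driver);
}
module_init(example_vio_init);

static void __exit example_vio_exit(void)
{
       vio_unregister_driver(&example_vio_driver);
}
module_exit(example_vio_exit);
MODULE_LICENSE("GPL");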
+
+/**
+ * vio_unregister_driver - Remove registration of vio driver.
+ * @viodrv:    The vio_driver struct to be removed from registration
+ */
+void vio_unregister_driver(struct vio_driver *viodrv)
+{
+       driver_unregister(&viodrv->driver);
+}
+EXPORT_SYMBOL(vio_unregister_driver);
+
+/* vio_dev refcount hit 0 */
+static void vio_dev_release(struct device *dev)
+{
+       struct iommu_table *tbl = get_iommu_table_base(dev);
+
+       if (tbl)
+               iommu_free_table(tbl, of_node_full_name(dev->of_node));
+       of_node_put(dev->of_node);
+       kfree(to_vio_dev(dev));
+}
+
+/**
+ * vio_register_device_node: - Register a new vio device.
+ * @of_node:   The OF node for this device.
+ *
+ * Creates and initializes a vio_dev structure from the data in
+ * of_node and adds it to the list of virtual devices.
+ * Returns a pointer to the created vio_dev or NULL if the node has
+ * NULL device_type or compatible fields.
+ */
+struct vio_dev *vio_register_device_node(struct device_node *of_node)
+{
+       struct vio_dev *viodev;
+       struct device_node *parent_node;
+       const __be32 *prop;
+       enum vio_dev_family family;
+       const char *of_node_name = of_node->name ? of_node->name : "<unknown>";
+
+       /*
+        * Determine if this node is under the /vdevice node or under the
+        * /ibm,platform-facilities node.  This decides the device's family.
+        */
+       parent_node = of_get_parent(of_node);
+       if (parent_node) {
+               if (!strcmp(parent_node->full_name, "/ibm,platform-facilities"))
+                       family = PFO;
+               else if (!strcmp(parent_node->full_name, "/vdevice"))
+                       family = VDEVICE;
+               else {
+                       pr_warn("%s: parent(%s) of %s not recognized.\n",
+                                       __func__,
+                                       parent_node->full_name,
+                                       of_node_name);
+                       of_node_put(parent_node);
+                       return NULL;
+               }
+               of_node_put(parent_node);
+       } else {
+               pr_warn("%s: could not determine the parent of node %s.\n",
+                               __func__, of_node_name);
+               return NULL;
+       }
+
+       if (family == PFO) {
+               if (of_get_property(of_node, "interrupt-controller", NULL)) {
+                       pr_debug("%s: Skipping the interrupt controller %s.\n",
+                                       __func__, of_node_name);
+                       return NULL;
+               }
+       }
+
+       /* allocate a vio_dev for this node */
+       viodev = kzalloc(sizeof(struct vio_dev), GFP_KERNEL);
+       if (viodev == NULL) {
+               pr_warn("%s: allocation failure for VIO device.\n", __func__);
+               return NULL;
+       }
+
+       /* we need the 'device_type' property, in order to match with drivers */
+       viodev->family = family;
+       if (viodev->family == VDEVICE) {
+               unsigned int unit_address;
+
+               if (of_node->type != NULL)
+                       viodev->type = of_node->type;
+               else {
+                       pr_warn("%s: node %s is missing the 'device_type' "
+                                       "property.\n", __func__, of_node_name);
+                       goto out;
+               }
+
+               prop = of_get_property(of_node, "reg", NULL);
+               if (prop == NULL) {
+                       pr_warn("%s: node %s missing 'reg'\n",
+                                       __func__, of_node_name);
+                       goto out;
+               }
+               unit_address = of_read_number(prop, 1);
+               dev_set_name(&viodev->dev, "%x", unit_address);
+               viodev->irq = irq_of_parse_and_map(of_node, 0);
+               viodev->unit_address = unit_address;
+       } else {
+               /* PFO devices need their resource_id for submitting COP_OPs.
+                * This is an optional field for devices, but it is required
+                * when performing synchronous ops. */
+               prop = of_get_property(of_node, "ibm,resource-id", NULL);
+               if (prop != NULL)
+                       viodev->resource_id = of_read_number(prop, 1);
+
+               dev_set_name(&viodev->dev, "%s", of_node_name);
+               viodev->type = of_node_name;
+               viodev->irq = 0;
+       }
+
+       viodev->name = of_node->name;
+       viodev->dev.of_node = of_node_get(of_node);
+
+       set_dev_node(&viodev->dev, of_node_to_nid(of_node));
+
+       /* init generic 'struct device' fields: */
+       viodev->dev.parent = &vio_bus_device.dev;
+       viodev->dev.bus = &vio_bus_type;
+       viodev->dev.release = vio_dev_release;
+
+       if (of_get_property(viodev->dev.of_node, "ibm,my-dma-window", NULL)) {
+               if (firmware_has_feature(FW_FEATURE_CMO))
+                       vio_cmo_set_dma_ops(viodev);
+               else
+                       set_dma_ops(&viodev->dev, &dma_iommu_ops);
+
+               set_iommu_table_base(&viodev->dev,
+                                    vio_build_iommu_table(viodev));
+
+               /* needed to ensure proper operation of coherent allocations
+                * later, in case driver doesn't set it explicitly */
+               viodev->dev.coherent_dma_mask = DMA_BIT_MASK(64);
+               viodev->dev.dma_mask = &viodev->dev.coherent_dma_mask;
+       }
+
+       /* register with generic device framework */
+       if (device_register(&viodev->dev)) {
+               printk(KERN_ERR "%s: failed to register device %s\n",
+                               __func__, dev_name(&viodev->dev));
+               put_device(&viodev->dev);
+               return NULL;
+       }
+
+       return viodev;
+
+out:   /* Use this exit point for any return prior to device_register */
+       kfree(viodev);
+
+       return NULL;
+}
+EXPORT_SYMBOL(vio_register_device_node);
+
+/*
+ * vio_bus_scan_register_devices - Scan OF and register each child device
+ * @root_name - OF node name for the root of the subtree to search.
+ *             This must be non-NULL.
+ *
+ * Starting from the root node provided, register the device node for
+ * each child beneath the root.
+ */
+static void vio_bus_scan_register_devices(char *root_name)
+{
+       struct device_node *node_root, *node_child;
+
+       if (!root_name)
+               return;
+
+       node_root = of_find_node_by_name(NULL, root_name);
+       if (node_root) {
+
+               /*
+                * Create a struct vio_dev for each virtual device in
+                * the device tree. Drivers will associate with them later.
+                */
+               node_child = of_get_next_child(node_root, NULL);
+               while (node_child) {
+                       vio_register_device_node(node_child);
+                       node_child = of_get_next_child(node_root, node_child);
+               }
+               of_node_put(node_root);
+       }
+}
+
+/**
+ * vio_bus_init: - Initialize the virtual IO bus
+ */
+static int __init vio_bus_init(void)
+{
+       int err;
+
+       if (firmware_has_feature(FW_FEATURE_CMO))
+               vio_cmo_sysfs_init();
+
+       err = bus_register(&vio_bus_type);
+       if (err) {
+               printk(KERN_ERR "failed to register VIO bus\n");
+               return err;
+       }
+
+       /*
+        * The fake parent of all vio devices, just to give us
+        * a nice directory
+        */
+       err = device_register(&vio_bus_device.dev);
+       if (err) {
+               printk(KERN_WARNING "%s: device_register returned %i\n",
+                               __func__, err);
+               return err;
+       }
+
+       if (firmware_has_feature(FW_FEATURE_CMO))
+               vio_cmo_bus_init();
+
+       return 0;
+}
+postcore_initcall(vio_bus_init);
+
+static int __init vio_device_init(void)
+{
+       vio_bus_scan_register_devices("vdevice");
+       vio_bus_scan_register_devices("ibm,platform-facilities");
+
+       return 0;
+}
+device_initcall(vio_device_init);
+
+static ssize_t name_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       return sprintf(buf, "%s\n", to_vio_dev(dev)->name);
+}
+
+static ssize_t devspec_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       struct device_node *of_node = dev->of_node;
+
+       return sprintf(buf, "%s\n", of_node_full_name(of_node));
+}
+
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+                            char *buf)
+{
+       const struct vio_dev *vio_dev = to_vio_dev(dev);
+       struct device_node *dn;
+       const char *cp;
+
+       dn = dev->of_node;
+       if (!dn) {
+               strcpy(buf, "\n");
+               return strlen(buf);
+       }
+       cp = of_get_property(dn, "compatible", NULL);
+       if (!cp) {
+               strcpy(buf, "\n");
+               return strlen(buf);
+       }
+
+       return sprintf(buf, "vio:T%sS%s\n", vio_dev->type, cp);
+}
+
+static struct device_attribute vio_dev_attrs[] = {
+       __ATTR_RO(name),
+       __ATTR_RO(devspec),
+       __ATTR_RO(modalias),
+       __ATTR_NULL
+};
+
+void vio_unregister_device(struct vio_dev *viodev)
+{
+       device_unregister(&viodev->dev);
+}
+EXPORT_SYMBOL(vio_unregister_device);
+
+static int vio_bus_match(struct device *dev, struct device_driver *drv)
+{
+       const struct vio_dev *vio_dev = to_vio_dev(dev);
+       struct vio_driver *vio_drv = to_vio_driver(drv);
+       const struct vio_device_id *ids = vio_drv->id_table;
+
+       return (ids != NULL) && (vio_match_device(ids, vio_dev) != NULL);
+}
+
+static int vio_hotplug(struct device *dev, struct kobj_uevent_env *env)
+{
+       const struct vio_dev *vio_dev = to_vio_dev(dev);
+       struct device_node *dn;
+       const char *cp;
+
+       dn = dev->of_node;
+       if (!dn)
+               return -ENODEV;
+       cp = of_get_property(dn, "compatible", NULL);
+       if (!cp)
+               return -ENODEV;
+
+       add_uevent_var(env, "MODALIAS=vio:T%sS%s", vio_dev->type, cp);
+       return 0;
+}
+
+struct bus_type vio_bus_type = {
+       .name = "vio",
+       .dev_attrs = vio_dev_attrs,
+       .uevent = vio_hotplug,
+       .match = vio_bus_match,
+       .probe = vio_bus_probe,
+       .remove = vio_bus_remove,
+};
+
+/**
+ * vio_get_attribute: - get attribute for virtual device
+ * @vdev:      The vio device to get the property from.
+ * @which:     The property/attribute to be extracted.
+ * @length:    Pointer to length of returned data size (unused if NULL).
+ *
+ * Calls of_get_property() to return the value of the attribute
+ * specified by @which.
+ */
+const void *vio_get_attribute(struct vio_dev *vdev, char *which, int *length)
+{
+       return of_get_property(vdev->dev.of_node, which, length);
+}
+EXPORT_SYMBOL(vio_get_attribute);
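For illustration, a short, hedged usage sketch: a driver reading a device-tree property of its vio device. The property name here is only an example of the kind of attribute a driver might look up:

/* Hypothetical helper: fetch and sanity-check a MAC address property. */
static void example_read_mac(struct vio_dev *vdev)
{
       const unsigned char *mac;
       int len = 0;

       mac = vio_get_attribute(vdev, "local-mac-address", &len);
       if (!mac || len < 6)
               dev_warn(&vdev->dev, "missing or short local-mac-address\n");
       else
               dev_info(&vdev->dev, "MAC %pM\n", mac);
}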
+
+#ifdef CONFIG_PPC_PSERIES
+/* vio_find_name() - internal because only vio.c knows how we formatted the
+ * kobject name
+ */
+static struct vio_dev *vio_find_name(const char *name)
+{
+       struct device *found;
+
+       found = bus_find_device_by_name(&vio_bus_type, NULL, name);
+       if (!found)
+               return NULL;
+
+       return to_vio_dev(found);
+}
+
+/**
+ * vio_find_node - find an already-registered vio_dev
+ * @vnode: device_node of the virtual device we're looking for
+ *
+ * Takes a reference to the embedded struct device which needs to be dropped
+ * after use.
+ */
+struct vio_dev *vio_find_node(struct device_node *vnode)
+{
+       char kobj_name[20];
+       struct device_node *vnode_parent;
+       const char *dev_type;
+
+       vnode_parent = of_get_parent(vnode);
+       if (!vnode_parent)
+               return NULL;
+
+       dev_type = of_get_property(vnode_parent, "device_type", NULL);
+       of_node_put(vnode_parent);
+       if (!dev_type)
+               return NULL;
+
+       /* construct the kobject name from the device node */
+       if (!strcmp(dev_type, "vdevice")) {
+               const __be32 *prop;
+               
+               prop = of_get_property(vnode, "reg", NULL);
+               if (!prop)
+                       return NULL;
+               snprintf(kobj_name, sizeof(kobj_name), "%x",
+                        (uint32_t)of_read_number(prop, 1));
+       } else if (!strcmp(dev_type, "ibm,platform-facilities"))
+               snprintf(kobj_name, sizeof(kobj_name), "%s", vnode->name);
+       else
+               return NULL;
+
+       return vio_find_name(kobj_name);
+}
+EXPORT_SYMBOL(vio_find_node);
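For illustration, a brief, hedged sketch of a lookup through vio_find_node(); as noted above, the reference taken on the embedded struct device must be dropped with put_device() when the caller is done:

/* Hypothetical caller holding a device_node from the OF tree. */
static void example_lookup(struct device_node *dn)
{
       struct vio_dev *vdev = vio_find_node(dn);

       if (!vdev)
               return;

       dev_info(&vdev->dev, "node %s is registered on the vio bus\n",
                of_node_full_name(dn));
       put_device(&vdev->dev);
}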
+
+int vio_enable_interrupts(struct vio_dev *dev)
+{
+       int rc = h_vio_signal(dev->unit_address, VIO_IRQ_ENABLE);
+       if (rc != H_SUCCESS)
+               printk(KERN_ERR "vio: Error 0x%x enabling interrupts\n", rc);
+       return rc;
+}
+EXPORT_SYMBOL(vio_enable_interrupts);
+
+int vio_disable_interrupts(struct vio_dev *dev)
+{
+       int rc = h_vio_signal(dev->unit_address, VIO_IRQ_DISABLE);
+       if (rc != H_SUCCESS)
+               printk(KERN_ERR "vio: Error 0x%x disabling interrupts\n", rc);
+       return rc;
+}
+EXPORT_SYMBOL(vio_disable_interrupts);
+#endif /* CONFIG_PPC_PSERIES */
diff --git a/arch/powerpc/purgatory/.gitignore b/arch/powerpc/purgatory/.gitignore
new file mode 100644 (file)
index 0000000..e9e66f1
--- /dev/null
@@ -0,0 +1,2 @@
+kexec-purgatory.c
+purgatory.ro
diff --git a/arch/powerpc/purgatory/Makefile b/arch/powerpc/purgatory/Makefile
new file mode 100644 (file)
index 0000000..ac8793c
--- /dev/null
@@ -0,0 +1,15 @@
+targets += trampoline.o purgatory.ro kexec-purgatory.c
+
+LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined
+
+$(obj)/purgatory.ro: $(obj)/trampoline.o FORCE
+               $(call if_changed,ld)
+
+CMD_BIN2C = $(objtree)/scripts/basic/bin2c
+quiet_cmd_bin2c = BIN2C   $@
+      cmd_bin2c = $(CMD_BIN2C) kexec_purgatory < $< > $@
+
+$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE
+       $(call if_changed,bin2c)
+
+obj-y  += kexec-purgatory.o
diff --git a/arch/powerpc/purgatory/trampoline.S b/arch/powerpc/purgatory/trampoline.S
new file mode 100644 (file)
index 0000000..f9760cc
--- /dev/null
@@ -0,0 +1,128 @@
+/*
+ * kexec trampoline
+ *
+ * Based on code taken from kexec-tools and kexec-lite.
+ *
+ * Copyright (C) 2004 - 2005, Milton D Miller II, IBM Corporation
+ * Copyright (C) 2006, Mohan Kumar M, IBM Corporation
+ * Copyright (C) 2013, Anton Blanchard, IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it under
+ * the terms of the GNU General Public License as published by the Free
+ * Software Foundation (version 2 of the License).
+ */
+
+#if defined(__LITTLE_ENDIAN__)
+#define STWX_BE        stwbrx
+#define LWZX_BE        lwbrx
+#elif defined(__BIG_ENDIAN__)
+#define STWX_BE        stwx
+#define LWZX_BE        lwzx
+#else
+#error no endianness defined!
+#endif
+
+       .machine ppc64
+       .balign 256
+       .globl purgatory_start
+purgatory_start:
+       b       master
+
+       /* ABI: possible run_at_load flag at 0x5c */
+       .org purgatory_start + 0x5c
+       .globl run_at_load
+run_at_load:
+       .long 0
+       .size run_at_load, . - run_at_load
+
+       /* ABI: slaves start at 0x60 with r3=phys */
+       .org purgatory_start + 0x60
+slave:
+       b .
+       /* ABI: end of copied region */
+       .org purgatory_start + 0x100
+       .size purgatory_start, . - purgatory_start
+
+/*
+ * The above 0x100 bytes at purgatory_start are replaced with the
+ * code from the kernel (or next stage) by setup_purgatory().
+ */
+
+master:
+       or      %r1,%r1,%r1     /* low priority to let other threads catch up */
+       isync
+       mr      %r17,%r3        /* save cpu id to r17 */
+       mr      %r15,%r4        /* save physical address in reg15 */
+
+       or      %r3,%r3,%r3     /* ok, now to high priority; let's boot */
+       lis     %r6,0x1
+       mtctr   %r6             /* delay a bit for slaves to catch up */
+       bdnz    .               /* before we overwrite 0-100 again */
+
+       bl      0f              /* Work out where we're running */
+0:     mflr    %r18
+
+       /* load device-tree address */
+       ld      %r3, (dt_offset - 0b)(%r18)
+       mr      %r16,%r3        /* save dt address in reg16 */
+       li      %r4,20
+       LWZX_BE %r6,%r3,%r4     /* fetch __be32 version number at byte 20 */
+       cmpwi   %r0,%r6,2       /* v2 or later? */
+       blt     1f
+       li      %r4,28
+       STWX_BE %r17,%r3,%r4    /* Store my cpu as __be32 at byte 28 */
+1:
+       /* load the kernel address */
+       ld      %r4,(kernel - 0b)(%r18)
+
+       /* load the run_at_load flag */
+       /* possibly patched by kexec */
+       ld      %r6,(run_at_load - 0b)(%r18)
+       /* and patch it into the kernel */
+       stw     %r6,(0x5c)(%r4)
+
+       mr      %r3,%r16        /* restore dt address */
+
+       li      %r5,0           /* r5 will be 0 for kernel */
+
+       mfmsr   %r11
+       andi.   %r10,%r11,1     /* test MSR_LE */
+       bne     .Little_endian
+
+       mtctr   %r4             /* prepare branch to */
+       bctr                    /* start kernel */
+
+.Little_endian:
+       mtsrr0  %r4             /* prepare branch to */
+
+       clrrdi  %r11,%r11,1     /* clear MSR_LE */
+       mtsrr1  %r11
+
+       rfid                    /* update MSR and start kernel */
+
+
+       .balign 8
+       .globl kernel
+kernel:
+       .llong  0x0
+       .size kernel, . - kernel
+
+       .balign 8
+       .globl dt_offset
+dt_offset:
+       .llong  0x0
+       .size dt_offset, . - dt_offset
+
+
+       .data
+       .balign 8
+.globl sha256_digest
+sha256_digest:
+       .skip   32
+       .size sha256_digest, . - sha256_digest
+
+       .balign 8
+.globl sha_regions
+sha_regions:
+       .skip   8 * 2 * 16
+       .size sha_regions, . - sha_regions
index 760545519a0bc3b7a8988824a9a677ad416286fe..9c0e17cf6886cdea0ad539f1c53ecbd7a66c502b 100644 (file)
@@ -10,6 +10,8 @@
  *      as published by the Free Software Foundation; either version
  *      2 of the License, or (at your option) any later version.
  */
+
+#include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
@@ -225,6 +227,7 @@ Commands:\n\
 #endif
   "\
   dr   dump stream of raw bytes\n\
+  dt   dump the tracing buffers (uses printk)\n\
   e    print exception information\n\
   f    flush cache\n\
   la   lookup symbol+offset of specified address\n\
@@ -2364,6 +2367,9 @@ dump(void)
                dump_log_buf();
        } else if (c == 'o') {
                dump_opal_msglog();
+       } else if (c == 't') {
+               ftrace_dump(DUMP_ALL);
+               tracing_on();
        } else if (c == 'r') {
                scanhex(&ndump);
                if (ndump == 0)
index 650830e39e3a7c8f8e01755f4ecf2bd5f83bb799..3741461c63a0cfc56cbc8cf88e3011260ba82018 100644 (file)
@@ -631,9 +631,9 @@ static int determine_backup_region(u64 start, u64 end, void *arg)
 
 int crash_load_segments(struct kimage *image)
 {
-       unsigned long src_start, src_sz, elf_sz;
-       void *elf_addr;
        int ret;
+       struct kexec_buf kbuf = { .image = image, .buf_min = 0,
+                                 .buf_max = ULONG_MAX, .top_down = false };
 
        /*
         * Determine and load a segment for backup area. First 640K RAM
@@ -647,43 +647,44 @@ int crash_load_segments(struct kimage *image)
        if (ret < 0)
                return ret;
 
-       src_start = image->arch.backup_src_start;
-       src_sz = image->arch.backup_src_sz;
-
        /* Add backup segment. */
-       if (src_sz) {
+       if (image->arch.backup_src_sz) {
+               kbuf.buffer = &crash_zero_bytes;
+               kbuf.bufsz = sizeof(crash_zero_bytes);
+               kbuf.memsz = image->arch.backup_src_sz;
+               kbuf.buf_align = PAGE_SIZE;
                /*
                 * Ideally there is no source for backup segment. This is
                 * copied in purgatory after crash. Just add a zero filled
                 * segment for now to make sure checksum logic works fine.
                 */
-               ret = kexec_add_buffer(image, (char *)&crash_zero_bytes,
-                                      sizeof(crash_zero_bytes), src_sz,
-                                      PAGE_SIZE, 0, -1, 0,
-                                      &image->arch.backup_load_addr);
+               ret = kexec_add_buffer(&kbuf);
                if (ret)
                        return ret;
+               image->arch.backup_load_addr = kbuf.mem;
                pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
-                        image->arch.backup_load_addr, src_start, src_sz);
+                        image->arch.backup_load_addr,
+                        image->arch.backup_src_start, kbuf.memsz);
        }
 
        /* Prepare elf headers and add a segment */
-       ret = prepare_elf_headers(image, &elf_addr, &elf_sz);
+       ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz);
        if (ret)
                return ret;
 
-       image->arch.elf_headers = elf_addr;
-       image->arch.elf_headers_sz = elf_sz;
+       image->arch.elf_headers = kbuf.buffer;
+       image->arch.elf_headers_sz = kbuf.bufsz;
 
-       ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz,
-                       ELF_CORE_HEADER_ALIGN, 0, -1, 0,
-                       &image->arch.elf_load_addr);
+       kbuf.memsz = kbuf.bufsz;
+       kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+       ret = kexec_add_buffer(&kbuf);
        if (ret) {
                vfree((void *)image->arch.elf_headers);
                return ret;
        }
+       image->arch.elf_load_addr = kbuf.mem;
        pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
-                image->arch.elf_load_addr, elf_sz, elf_sz);
+                image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz);
 
        return ret;
 }
index 3407b148c2401342caed9fb57f3e8a0e4c1689e4..d0a814a9d96ac5412b1216060169d53a69133aea 100644 (file)
@@ -331,17 +331,17 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
 
        struct setup_header *header;
        int setup_sects, kern16_size, ret = 0;
-       unsigned long setup_header_size, params_cmdline_sz, params_misc_sz;
+       unsigned long setup_header_size, params_cmdline_sz;
        struct boot_params *params;
        unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr;
        unsigned long purgatory_load_addr;
-       unsigned long kernel_bufsz, kernel_memsz, kernel_align;
-       char *kernel_buf;
        struct bzimage64_data *ldata;
        struct kexec_entry64_regs regs64;
        void *stack;
        unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr);
        unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset;
+       struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX,
+                                 .top_down = true };
 
        header = (struct setup_header *)(kernel + setup_hdr_offset);
        setup_sects = header->setup_sects;
@@ -402,11 +402,11 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
        params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
                                MAX_ELFCOREHDR_STR_LEN;
        params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
-       params_misc_sz = params_cmdline_sz + efi_map_sz +
+       kbuf.bufsz = params_cmdline_sz + efi_map_sz +
                                sizeof(struct setup_data) +
                                sizeof(struct efi_setup_data);
 
-       params = kzalloc(params_misc_sz, GFP_KERNEL);
+       params = kzalloc(kbuf.bufsz, GFP_KERNEL);
        if (!params)
                return ERR_PTR(-ENOMEM);
        efi_map_offset = params_cmdline_sz;
@@ -418,37 +418,41 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
        /* Is there a limit on setup header size? */
        memcpy(&params->hdr, (kernel + setup_hdr_offset), setup_header_size);
 
-       ret = kexec_add_buffer(image, (char *)params, params_misc_sz,
-                              params_misc_sz, 16, MIN_BOOTPARAM_ADDR,
-                              ULONG_MAX, 1, &bootparam_load_addr);
+       kbuf.buffer = params;
+       kbuf.memsz = kbuf.bufsz;
+       kbuf.buf_align = 16;
+       kbuf.buf_min = MIN_BOOTPARAM_ADDR;
+       ret = kexec_add_buffer(&kbuf);
        if (ret)
                goto out_free_params;
+       bootparam_load_addr = kbuf.mem;
        pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
-                bootparam_load_addr, params_misc_sz, params_misc_sz);
+                bootparam_load_addr, kbuf.bufsz, kbuf.bufsz);
 
        /* Load kernel */
-       kernel_buf = kernel + kern16_size;
-       kernel_bufsz =  kernel_len - kern16_size;
-       kernel_memsz = PAGE_ALIGN(header->init_size);
-       kernel_align = header->kernel_alignment;
-
-       ret = kexec_add_buffer(image, kernel_buf,
-                              kernel_bufsz, kernel_memsz, kernel_align,
-                              MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1,
-                              &kernel_load_addr);
+       kbuf.buffer = kernel + kern16_size;
+       kbuf.bufsz =  kernel_len - kern16_size;
+       kbuf.memsz = PAGE_ALIGN(header->init_size);
+       kbuf.buf_align = header->kernel_alignment;
+       kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+       ret = kexec_add_buffer(&kbuf);
        if (ret)
                goto out_free_params;
+       kernel_load_addr = kbuf.mem;
 
        pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
-                kernel_load_addr, kernel_memsz, kernel_memsz);
+                kernel_load_addr, kbuf.bufsz, kbuf.memsz);
 
        /* Load initrd high */
        if (initrd) {
-               ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len,
-                                      PAGE_SIZE, MIN_INITRD_LOAD_ADDR,
-                                      ULONG_MAX, 1, &initrd_load_addr);
+               kbuf.buffer = initrd;
+               kbuf.bufsz = kbuf.memsz = initrd_len;
+               kbuf.buf_align = PAGE_SIZE;
+               kbuf.buf_min = MIN_INITRD_LOAD_ADDR;
+               ret = kexec_add_buffer(&kbuf);
                if (ret)
                        goto out_free_params;
+               initrd_load_addr = kbuf.mem;
 
                pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
                                initrd_load_addr, initrd_len, initrd_len);
index 2e5233b6097110e72ae147f21ed15cf259b0a5a9..1b35e33d24341e2013b4fe7aec78dca70f7cc2cd 100644 (file)
 
 #include <linux/pci.h>
 #include <linux/slab.h>
-#include <linux/anon_inodes.h>
 #include <linux/file.h>
 #include <misc/cxl.h>
-#include <linux/fs.h>
 #include <asm/pnv-pci.h>
 #include <linux/msi.h>
+#include <linux/module.h>
+#include <linux/mount.h>
 
 #include "cxl.h"
 
+/*
+ * Since we want to track memory mappings to be able to force-unmap
+ * when the AFU is no longer reachable, we need an inode. For devices
+ * opened through the cxl user API, this is not a problem, but a
+ * userland process can also get a cxl fd through the cxl_get_fd()
+ * API, which is used by the cxlflash driver.
+ *
+ * Therefore we implement our own simple pseudo-filesystem and inode
+ * allocator. We don't use the anonymous inode, as we need the
+ * meta-data associated with it (address_space) and it is shared by
+ * other drivers/processes, so it could lead to cxl unmapping VMAs
+ * from random processes.
+ */
+
+#define CXL_PSEUDO_FS_MAGIC    0x1697697f
+
+static int cxl_fs_cnt;
+static struct vfsmount *cxl_vfs_mount;
+
+static const struct dentry_operations cxl_fs_dops = {
+       .d_dname        = simple_dname,
+};
+
+static struct dentry *cxl_fs_mount(struct file_system_type *fs_type, int flags,
+                               const char *dev_name, void *data)
+{
+       return mount_pseudo(fs_type, "cxl:", NULL, &cxl_fs_dops,
+                       CXL_PSEUDO_FS_MAGIC);
+}
+
+static struct file_system_type cxl_fs_type = {
+       .name           = "cxl",
+       .owner          = THIS_MODULE,
+       .mount          = cxl_fs_mount,
+       .kill_sb        = kill_anon_super,
+};
+
+
+void cxl_release_mapping(struct cxl_context *ctx)
+{
+       if (ctx->kernelapi && ctx->mapping)
+               simple_release_fs(&cxl_vfs_mount, &cxl_fs_cnt);
+}
+
+static struct file *cxl_getfile(const char *name,
+                               const struct file_operations *fops,
+                               void *priv, int flags)
+{
+       struct qstr this;
+       struct path path;
+       struct file *file;
+       struct inode *inode = NULL;
+       int rc;
+
+       /* strongly inspired by anon_inode_getfile() */
+
+       if (fops->owner && !try_module_get(fops->owner))
+               return ERR_PTR(-ENOENT);
+
+       rc = simple_pin_fs(&cxl_fs_type, &cxl_vfs_mount, &cxl_fs_cnt);
+       if (rc < 0) {
+               pr_err("Cannot mount cxl pseudo filesystem: %d\n", rc);
+               file = ERR_PTR(rc);
+               goto err_module;
+       }
+
+       inode = alloc_anon_inode(cxl_vfs_mount->mnt_sb);
+       if (IS_ERR(inode)) {
+               file = ERR_CAST(inode);
+               goto err_fs;
+       }
+
+       file = ERR_PTR(-ENOMEM);
+       this.name = name;
+       this.len = strlen(name);
+       this.hash = 0;
+       path.dentry = d_alloc_pseudo(cxl_vfs_mount->mnt_sb, &this);
+       if (!path.dentry)
+               goto err_inode;
+
+       path.mnt = mntget(cxl_vfs_mount);
+       d_instantiate(path.dentry, inode);
+
+       file = alloc_file(&path, OPEN_FMODE(flags), fops);
+       if (IS_ERR(file))
+               goto err_dput;
+       file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
+       file->private_data = priv;
+
+       return file;
+
+err_dput:
+       path_put(&path);
+err_inode:
+       iput(inode);
+err_fs:
+       simple_release_fs(&cxl_vfs_mount, &cxl_fs_cnt);
+err_module:
+       module_put(fops->owner);
+       return file;
+}
+
 struct cxl_context *cxl_dev_context_init(struct pci_dev *dev)
 {
-       struct address_space *mapping;
        struct cxl_afu *afu;
        struct cxl_context  *ctx;
        int rc;
@@ -30,38 +131,20 @@ struct cxl_context *cxl_dev_context_init(struct pci_dev *dev)
                return ERR_CAST(afu);
 
        ctx = cxl_context_alloc();
-       if (IS_ERR(ctx)) {
-               rc = PTR_ERR(ctx);
-               goto err_dev;
-       }
+       if (!ctx)
+               return ERR_PTR(-ENOMEM);
 
        ctx->kernelapi = true;
 
-       /*
-        * Make our own address space since we won't have one from the
-        * filesystem like the user api has, and even if we do associate a file
-        * with this context we don't want to use the global anonymous inode's
-        * address space as that can invalidate unrelated users:
-        */
-       mapping = kmalloc(sizeof(struct address_space), GFP_KERNEL);
-       if (!mapping) {
-               rc = -ENOMEM;
-               goto err_ctx;
-       }
-       address_space_init_once(mapping);
-
        /* Make it a slave context.  We can promote it later? */
-       rc = cxl_context_init(ctx, afu, false, mapping);
+       rc = cxl_context_init(ctx, afu, false);
        if (rc)
-               goto err_mapping;
+               goto err_ctx;
 
        return ctx;
 
-err_mapping:
-       kfree(mapping);
 err_ctx:
        kfree(ctx);
-err_dev:
        return ERR_PTR(rc);
 }
 EXPORT_SYMBOL_GPL(cxl_dev_context_init);
@@ -340,6 +423,11 @@ struct file *cxl_get_fd(struct cxl_context *ctx, struct file_operations *fops,
 {
        struct file *file;
        int rc, flags, fdtmp;
+       char *name = NULL;
+
+       /* only allow one per context */
+       if (ctx->mapping)
+               return ERR_PTR(-EEXIST);
 
        flags = O_RDWR | O_CLOEXEC;
 
@@ -363,12 +451,13 @@ struct file *cxl_get_fd(struct cxl_context *ctx, struct file_operations *fops,
        } else /* use default ops */
                fops = (struct file_operations *)&afu_fops;
 
-       file = anon_inode_getfile("cxl", fops, ctx, flags);
+       name = kasprintf(GFP_KERNEL, "cxl:%d", ctx->pe);
+       file = cxl_getfile(name, fops, ctx, flags);
+       kfree(name);
        if (IS_ERR(file))
                goto err_fd;
 
-       file->f_mapping = ctx->mapping;
-
+       cxl_context_set_mapping(ctx, file->f_mapping);
        *fd = fdtmp;
        return file;
 
@@ -541,7 +630,7 @@ int _cxl_cx4_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 
                if (remaining > 0) {
                        new_ctx = cxl_dev_context_init(pdev);
-                       if (!new_ctx) {
+                       if (IS_ERR(new_ctx)) {
                                pr_warn("%s: Failed to allocate enough contexts for MSIs\n", pci_name(pdev));
                                return -ENOSPC;
                        }
index 5e506c19108ad22da4a002957fd056711138b0f3..ff5e7e8cb1d151f32bc04e813b7ba05f531e04d2 100644 (file)
@@ -34,8 +34,7 @@ struct cxl_context *cxl_context_alloc(void)
 /*
  * Initialises a CXL context.
  */
-int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master,
-                    struct address_space *mapping)
+int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master)
 {
        int i;
 
@@ -44,7 +43,7 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master,
        ctx->master = master;
        ctx->pid = ctx->glpid = NULL; /* Set in start work ioctl */
        mutex_init(&ctx->mapping_lock);
-       ctx->mapping = mapping;
+       ctx->mapping = NULL;
 
        /*
         * Allocate the segment table before we put it in the IDR so that we
@@ -114,6 +113,14 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master,
        return 0;
 }
 
+void cxl_context_set_mapping(struct cxl_context *ctx,
+                       struct address_space *mapping)
+{
+       mutex_lock(&ctx->mapping_lock);
+       ctx->mapping = mapping;
+       mutex_unlock(&ctx->mapping_lock);
+}
+
 static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct cxl_context *ctx = vma->vm_file->private_data;
@@ -300,8 +307,6 @@ static void reclaim_ctx(struct rcu_head *rcu)
        if (ctx->ff_page)
                __free_page(ctx->ff_page);
        ctx->sstp = NULL;
-       if (ctx->kernelapi)
-               kfree(ctx->mapping);
 
        kfree(ctx->irq_bitmap);
 
@@ -313,6 +318,8 @@ static void reclaim_ctx(struct rcu_head *rcu)
 
 void cxl_context_free(struct cxl_context *ctx)
 {
+       if (ctx->kernelapi && ctx->mapping)
+               cxl_release_mapping(ctx);
        mutex_lock(&ctx->afu->contexts_lock);
        idr_remove(&ctx->afu->contexts_idr, ctx->pe);
        mutex_unlock(&ctx->afu->contexts_lock);
index a144073593fa1e5170bba669d7ba467eb06ada5b..b24d76723fb0900b2588fa0a8f192ca9174ad199 100644 (file)
@@ -817,8 +817,9 @@ void cxl_dump_debug_buffer(void *addr, size_t size);
 void init_cxl_native(void);
 
 struct cxl_context *cxl_context_alloc(void);
-int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master,
-                    struct address_space *mapping);
+int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master);
+void cxl_context_set_mapping(struct cxl_context *ctx,
+                       struct address_space *mapping);
 void cxl_context_free(struct cxl_context *ctx);
 int cxl_context_iomap(struct cxl_context *ctx, struct vm_area_struct *vma);
 unsigned int cxl_map_irq(struct cxl *adapter, irq_hw_number_t hwirq,
@@ -877,6 +878,7 @@ void cxl_native_err_irq_dump_regs(struct cxl *adapter);
 void cxl_stop_trace(struct cxl *cxl);
 int cxl_pci_vphb_add(struct cxl_afu *afu);
 void cxl_pci_vphb_remove(struct cxl_afu *afu);
+void cxl_release_mapping(struct cxl_context *ctx);
 
 extern struct pci_driver cxl_pci_driver;
 extern struct platform_driver cxl_of_driver;
index ec7b8a0174393204667915175d6a01dba48d35ee..9c06ac8fa5acad874975d3d5e8d87c5c6e4fbc4a 100644 (file)
@@ -43,12 +43,14 @@ static int debugfs_io_u64_set(void *data, u64 val)
        out_be64((u64 __iomem *)data, val);
        return 0;
 }
-DEFINE_SIMPLE_ATTRIBUTE(fops_io_x64, debugfs_io_u64_get, debugfs_io_u64_set, "0x%016llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_io_x64, debugfs_io_u64_get, debugfs_io_u64_set,
+                        "0x%016llx\n");
 
 static struct dentry *debugfs_create_io_x64(const char *name, umode_t mode,
                                            struct dentry *parent, u64 __iomem *value)
 {
-       return debugfs_create_file(name, mode, parent, (void __force *)value, &fops_io_x64);
+       return debugfs_create_file_unsafe(name, mode, parent,
+                                         (void __force *)value, &fops_io_x64);
 }
 
 void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir)
index 77080cc5fa0aa4cdbc476729e4cdabcac8afae7b..859959f19f1072ff0a8de823b6ad063423dea93a 100644 (file)
@@ -86,9 +86,12 @@ static int __afu_open(struct inode *inode, struct file *file, bool master)
                goto err_put_afu;
        }
 
-       if ((rc = cxl_context_init(ctx, afu, master, inode->i_mapping)))
+       rc = cxl_context_init(ctx, afu, master);
+       if (rc)
                goto err_put_afu;
 
+       cxl_context_set_mapping(ctx, inode->i_mapping);
+
        pr_devel("afu_open pe: %i\n", ctx->pe);
        file->private_data = ctx;
        cxl_ctx_get();
index 3e102cd6ed914d992152128423cce8cbab33e830..e04bc4ddfd742527e0a649540795c97a25a6cfab 100644 (file)
@@ -887,7 +887,7 @@ static void afu_handle_errstate(struct work_struct *work)
            afu_guest->previous_state == H_STATE_PERM_UNAVAILABLE)
                return;
 
-       if (afu_guest->handle_err == true)
+       if (afu_guest->handle_err)
                schedule_delayed_work(&afu_guest->work_err,
                                      msecs_to_jiffies(3000));
 }
index dec60f58a7677b8999af2a246613e03a19b0d3bc..1a402bbed687fb69817d0cdf2cde58e59d8cc2b5 100644 (file)
@@ -104,7 +104,7 @@ irqreturn_t cxl_irq(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_i
                } else {
                        spin_lock(&ctx->lock);
                        ctx->afu_err = irq_info->afu_err;
-                       ctx->pending_afu_err = 1;
+                       ctx->pending_afu_err = true;
                        spin_unlock(&ctx->lock);
 
                        wake_up_all(&ctx->wq);
index a217a74ccc980d13f71577e46b151bf3f3397de7..09505f432eda6231e0265769d7214196a5660557 100644 (file)
@@ -10,7 +10,6 @@
 #include <linux/spinlock.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
 #include <linux/mutex.h>
 #include <linux/mm.h>
 #include <linux/uaccess.h>
@@ -54,7 +53,7 @@ static int afu_control(struct cxl_afu *afu, u64 command, u64 clear,
                                     AFU_Cntl | command);
                cpu_relax();
                AFU_Cntl = cxl_p2n_read(afu, CXL_AFU_Cntl_An);
-       };
+       }
 
        if (AFU_Cntl & CXL_AFU_Cntl_An_RA) {
                /*
@@ -167,7 +166,7 @@ int cxl_psl_purge(struct cxl_afu *afu)
                        cpu_relax();
                }
                PSL_CNTL = cxl_p1n_read(afu, CXL_PSL_SCNTL_An);
-       };
+       }
        end = local_clock();
        pr_devel("PSL purged in %lld ns\n", end - start);
 
@@ -931,9 +930,18 @@ static irqreturn_t native_irq_multiplexed(int irq, void *data)
        struct cxl_afu *afu = data;
        struct cxl_context *ctx;
        struct cxl_irq_info irq_info;
-       int ph = cxl_p2n_read(afu, CXL_PSL_PEHandle_An) & 0xffff;
-       int ret;
-
+       u64 phreg = cxl_p2n_read(afu, CXL_PSL_PEHandle_An);
+       int ph, ret;
+
+       /* check if eeh kicked in while the interrupt was in flight */
+       if (unlikely(phreg == ~0ULL)) {
+               dev_warn(&afu->dev,
+                        "Ignoring slice interrupt(%d) due to fenced card\n",
+                        irq);
+               return IRQ_HANDLED;
+       }
+       /* Mask the pe-handle from register value */
+       ph = phreg & 0xffff;
        if ((ret = native_get_irq_info(afu, &irq_info))) {
                WARN(1, "Unable to get CXL IRQ Info: %i\n", ret);
                return fail_psl_irq(afu, &irq_info);
index e96be9ca4e60437db6bfba9b098fad852790f722..80a87ab25b83857a77ada244b6159c7b1c55a31a 100644 (file)
@@ -1921,7 +1921,7 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
                                goto err;
 
                        ctx = cxl_dev_context_init(afu_dev);
-                       if (!ctx)
+                       if (IS_ERR(ctx))
                                goto err;
 
                        afu_dev->dev.archdata.cxl_ctx = ctx;
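
This hunk (and the similar one in the following file) switches the failure check from !ctx to IS_ERR(ctx), presumably because cxl_dev_context_init() reports failure as an ERR_PTR-encoded errno rather than NULL, so a NULL test never fires. A short sketch of the idiom, with my_dev/my_ctx/my_ctx_init() as hypothetical stand-ins:

#include <linux/err.h>
#include <linux/slab.h>

static struct my_ctx *my_ctx_init(struct my_dev *dev)
{
        struct my_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return ERR_PTR(-ENOMEM);   /* errno encoded in the pointer */
        return ctx;
}

static int my_probe(struct my_dev *dev)
{
        struct my_ctx *ctx = my_ctx_init(dev);

        if (IS_ERR(ctx))                   /* a "!ctx" test here would miss the error */
                return PTR_ERR(ctx);
        dev->ctx = ctx;
        return 0;
}
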
index 0935d44c177097f76f309d7084d440123c7201b7..6ec69ada19f480ce62c1763e8b473a923f87951b 100644 (file)
@@ -20,7 +20,7 @@ bool _cxl_pci_associate_default_context(struct pci_dev *dev, struct cxl_afu *afu
         * in the virtual phb, we'll need a default context to attach them to.
         */
        ctx = cxl_dev_context_init(dev);
-       if (!ctx)
+       if (IS_ERR(ctx))
                return false;
        dev->dev.archdata.cxl_ctx = ctx;
 
index 80378ddadc5ca4581bcba2792dc49f56209f2902..c8823578a1b2afd3ae7a36c2f526fd071116876b 100644 (file)
 static void tce_iommu_detach_group(void *iommu_data,
                struct iommu_group *iommu_group);
 
-static long try_increment_locked_vm(long npages)
+static long try_increment_locked_vm(struct mm_struct *mm, long npages)
 {
        long ret = 0, locked, lock_limit;
 
-       if (!current || !current->mm)
-               return -ESRCH; /* process exited */
+       if (WARN_ON_ONCE(!mm))
+               return -EPERM;
 
        if (!npages)
                return 0;
 
-       down_write(&current->mm->mmap_sem);
-       locked = current->mm->locked_vm + npages;
+       down_write(&mm->mmap_sem);
+       locked = mm->locked_vm + npages;
        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
        if (locked > lock_limit && !capable(CAP_IPC_LOCK))
                ret = -ENOMEM;
        else
-               current->mm->locked_vm += npages;
+               mm->locked_vm += npages;
 
        pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
                        npages << PAGE_SHIFT,
-                       current->mm->locked_vm << PAGE_SHIFT,
+                       mm->locked_vm << PAGE_SHIFT,
                        rlimit(RLIMIT_MEMLOCK),
                        ret ? " - exceeded" : "");
 
-       up_write(&current->mm->mmap_sem);
+       up_write(&mm->mmap_sem);
 
        return ret;
 }
 
-static void decrement_locked_vm(long npages)
+static void decrement_locked_vm(struct mm_struct *mm, long npages)
 {
-       if (!current || !current->mm || !npages)
-               return; /* process exited */
+       if (!mm || !npages)
+               return;
 
-       down_write(&current->mm->mmap_sem);
-       if (WARN_ON_ONCE(npages > current->mm->locked_vm))
-               npages = current->mm->locked_vm;
-       current->mm->locked_vm -= npages;
+       down_write(&mm->mmap_sem);
+       if (WARN_ON_ONCE(npages > mm->locked_vm))
+               npages = mm->locked_vm;
+       mm->locked_vm -= npages;
        pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
                        npages << PAGE_SHIFT,
-                       current->mm->locked_vm << PAGE_SHIFT,
+                       mm->locked_vm << PAGE_SHIFT,
                        rlimit(RLIMIT_MEMLOCK));
-       up_write(&current->mm->mmap_sem);
+       up_write(&mm->mmap_sem);
 }
 
 /*
@@ -88,6 +88,15 @@ struct tce_iommu_group {
        struct iommu_group *grp;
 };
 
+/*
+ * A container needs to remember which preregistered regions it has
+ * referenced so it can do proper cleanup at userspace process exit.
+ */
+struct tce_iommu_prereg {
+       struct list_head next;
+       struct mm_iommu_table_group_mem_t *mem;
+};
+
 /*
  * The container descriptor supports only a single group per container.
  * Required by the API as the container is not supplied with the IOMMU group
@@ -97,24 +106,68 @@ struct tce_container {
        struct mutex lock;
        bool enabled;
        bool v2;
+       bool def_window_pending;
        unsigned long locked_pages;
+       struct mm_struct *mm;
        struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
        struct list_head group_list;
+       struct list_head prereg_list;
 };
 
+static long tce_iommu_mm_set(struct tce_container *container)
+{
+       if (container->mm) {
+               if (container->mm == current->mm)
+                       return 0;
+               return -EPERM;
+       }
+       BUG_ON(!current->mm);
+       container->mm = current->mm;
+       atomic_inc(&container->mm->mm_count);
+
+       return 0;
+}
+
+static long tce_iommu_prereg_free(struct tce_container *container,
+               struct tce_iommu_prereg *tcemem)
+{
+       long ret;
+
+       ret = mm_iommu_put(container->mm, tcemem->mem);
+       if (ret)
+               return ret;
+
+       list_del(&tcemem->next);
+       kfree(tcemem);
+
+       return 0;
+}
+
 static long tce_iommu_unregister_pages(struct tce_container *container,
                __u64 vaddr, __u64 size)
 {
        struct mm_iommu_table_group_mem_t *mem;
+       struct tce_iommu_prereg *tcemem;
+       bool found = false;
 
        if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
                return -EINVAL;
 
-       mem = mm_iommu_find(vaddr, size >> PAGE_SHIFT);
+       mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT);
        if (!mem)
                return -ENOENT;
 
-       return mm_iommu_put(mem);
+       list_for_each_entry(tcemem, &container->prereg_list, next) {
+               if (tcemem->mem == mem) {
+                       found = true;
+                       break;
+               }
+       }
+
+       if (!found)
+               return -ENOENT;
+
+       return tce_iommu_prereg_free(container, tcemem);
 }
 
 static long tce_iommu_register_pages(struct tce_container *container,
@@ -122,22 +175,36 @@ static long tce_iommu_register_pages(struct tce_container *container,
 {
        long ret = 0;
        struct mm_iommu_table_group_mem_t *mem = NULL;
+       struct tce_iommu_prereg *tcemem;
        unsigned long entries = size >> PAGE_SHIFT;
 
        if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
                        ((vaddr + size) < vaddr))
                return -EINVAL;
 
-       ret = mm_iommu_get(vaddr, entries, &mem);
+       mem = mm_iommu_find(container->mm, vaddr, entries);
+       if (mem) {
+               list_for_each_entry(tcemem, &container->prereg_list, next) {
+                       if (tcemem->mem == mem)
+                               return -EBUSY;
+               }
+       }
+
+       ret = mm_iommu_get(container->mm, vaddr, entries, &mem);
        if (ret)
                return ret;
 
+       tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL);
+       if (!tcemem) {
+               mm_iommu_put(container->mm, mem);
+               return -ENOMEM;
+       }
+       tcemem->mem = mem;
+       list_add(&tcemem->next, &container->prereg_list);
+
        container->enabled = true;
 
        return 0;
 }
 
-static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl)
+static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl,
+               struct mm_struct *mm)
 {
        unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
                        tbl->it_size, PAGE_SIZE);
@@ -146,13 +213,13 @@ static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl)
 
        BUG_ON(tbl->it_userspace);
 
-       ret = try_increment_locked_vm(cb >> PAGE_SHIFT);
+       ret = try_increment_locked_vm(mm, cb >> PAGE_SHIFT);
        if (ret)
                return ret;
 
        uas = vzalloc(cb);
        if (!uas) {
-               decrement_locked_vm(cb >> PAGE_SHIFT);
+               decrement_locked_vm(mm, cb >> PAGE_SHIFT);
                return -ENOMEM;
        }
        tbl->it_userspace = uas;
@@ -160,7 +227,8 @@ static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl)
        return 0;
 }
 
-static void tce_iommu_userspace_view_free(struct iommu_table *tbl)
+static void tce_iommu_userspace_view_free(struct iommu_table *tbl,
+               struct mm_struct *mm)
 {
        unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
                        tbl->it_size, PAGE_SIZE);
@@ -170,7 +238,7 @@ static void tce_iommu_userspace_view_free(struct iommu_table *tbl)
 
        vfree(tbl->it_userspace);
        tbl->it_userspace = NULL;
-       decrement_locked_vm(cb >> PAGE_SHIFT);
+       decrement_locked_vm(mm, cb >> PAGE_SHIFT);
 }
 
 static bool tce_page_is_contained(struct page *page, unsigned page_shift)
@@ -230,9 +298,6 @@ static int tce_iommu_enable(struct tce_container *container)
        struct iommu_table_group *table_group;
        struct tce_iommu_group *tcegrp;
 
-       if (!current->mm)
-               return -ESRCH; /* process exited */
-
        if (container->enabled)
                return -EBUSY;
 
@@ -277,8 +342,12 @@ static int tce_iommu_enable(struct tce_container *container)
        if (!table_group->tce32_size)
                return -EPERM;
 
+       ret = tce_iommu_mm_set(container);
+       if (ret)
+               return ret;
+
        locked = table_group->tce32_size >> PAGE_SHIFT;
-       ret = try_increment_locked_vm(locked);
+       ret = try_increment_locked_vm(container->mm, locked);
        if (ret)
                return ret;
 
@@ -296,10 +365,8 @@ static void tce_iommu_disable(struct tce_container *container)
 
        container->enabled = false;
 
-       if (!current->mm)
-               return;
-
-       decrement_locked_vm(container->locked_pages);
+       BUG_ON(!container->mm);
+       decrement_locked_vm(container->mm, container->locked_pages);
 }
 
 static void *tce_iommu_open(unsigned long arg)
@@ -317,6 +384,7 @@ static void *tce_iommu_open(unsigned long arg)
 
        mutex_init(&container->lock);
        INIT_LIST_HEAD_RCU(&container->group_list);
+       INIT_LIST_HEAD_RCU(&container->prereg_list);
 
        container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;
 
@@ -326,7 +394,8 @@ static void *tce_iommu_open(unsigned long arg)
 static int tce_iommu_clear(struct tce_container *container,
                struct iommu_table *tbl,
                unsigned long entry, unsigned long pages);
-static void tce_iommu_free_table(struct iommu_table *tbl);
+static void tce_iommu_free_table(struct tce_container *container,
+               struct iommu_table *tbl);
 
 static void tce_iommu_release(void *iommu_data)
 {
@@ -351,10 +420,20 @@ static void tce_iommu_release(void *iommu_data)
                        continue;
 
                tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
-               tce_iommu_free_table(tbl);
+               tce_iommu_free_table(container, tbl);
+       }
+
+       while (!list_empty(&container->prereg_list)) {
+               struct tce_iommu_prereg *tcemem;
+
+               tcemem = list_first_entry(&container->prereg_list,
+                               struct tce_iommu_prereg, next);
+               WARN_ON_ONCE(tce_iommu_prereg_free(container, tcemem));
        }
 
        tce_iommu_disable(container);
+       if (container->mm)
+               mmdrop(container->mm);
        mutex_destroy(&container->lock);
 
        kfree(container);
@@ -369,13 +448,14 @@ static void tce_iommu_unuse_page(struct tce_container *container,
        put_page(page);
 }
 
-static int tce_iommu_prereg_ua_to_hpa(unsigned long tce, unsigned long size,
+static int tce_iommu_prereg_ua_to_hpa(struct tce_container *container,
+               unsigned long tce, unsigned long size,
                unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
 {
        long ret = 0;
        struct mm_iommu_table_group_mem_t *mem;
 
-       mem = mm_iommu_lookup(tce, size);
+       mem = mm_iommu_lookup(container->mm, tce, size);
        if (!mem)
                return -EINVAL;
 
@@ -388,18 +468,18 @@ static int tce_iommu_prereg_ua_to_hpa(unsigned long tce, unsigned long size,
        return 0;
 }
 
-static void tce_iommu_unuse_page_v2(struct iommu_table *tbl,
-               unsigned long entry)
+static void tce_iommu_unuse_page_v2(struct tce_container *container,
+               struct iommu_table *tbl, unsigned long entry)
 {
        struct mm_iommu_table_group_mem_t *mem = NULL;
        int ret;
        unsigned long hpa = 0;
        unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
 
-       if (!pua || !current || !current->mm)
+       if (!pua)
                return;
 
-       ret = tce_iommu_prereg_ua_to_hpa(*pua, IOMMU_PAGE_SIZE(tbl),
+       ret = tce_iommu_prereg_ua_to_hpa(container, *pua, IOMMU_PAGE_SIZE(tbl),
                        &hpa, &mem);
        if (ret)
                pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
@@ -429,7 +509,7 @@ static int tce_iommu_clear(struct tce_container *container,
                        continue;
 
                if (container->v2) {
-                       tce_iommu_unuse_page_v2(tbl, entry);
+                       tce_iommu_unuse_page_v2(container, tbl, entry);
                        continue;
                }
 
@@ -509,13 +589,19 @@ static long tce_iommu_build_v2(struct tce_container *container,
        unsigned long hpa;
        enum dma_data_direction dirtmp;
 
+       if (!tbl->it_userspace) {
+               ret = tce_iommu_userspace_view_alloc(tbl, container->mm);
+               if (ret)
+                       return ret;
+       }
+
        for (i = 0; i < pages; ++i) {
                struct mm_iommu_table_group_mem_t *mem = NULL;
                unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
                                entry + i);
 
-               ret = tce_iommu_prereg_ua_to_hpa(tce, IOMMU_PAGE_SIZE(tbl),
-                               &hpa, &mem);
+               ret = tce_iommu_prereg_ua_to_hpa(container,
+                               tce, IOMMU_PAGE_SIZE(tbl), &hpa, &mem);
                if (ret)
                        break;
 
@@ -536,7 +622,7 @@ static long tce_iommu_build_v2(struct tce_container *container,
                ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
                if (ret) {
                        /* dirtmp cannot be DMA_NONE here */
-                       tce_iommu_unuse_page_v2(tbl, entry + i);
+                       tce_iommu_unuse_page_v2(container, tbl, entry + i);
                        pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
                                        __func__, entry << tbl->it_page_shift,
                                        tce, ret);
@@ -544,7 +630,7 @@ static long tce_iommu_build_v2(struct tce_container *container,
                }
 
                if (dirtmp != DMA_NONE)
-                       tce_iommu_unuse_page_v2(tbl, entry + i);
+                       tce_iommu_unuse_page_v2(container, tbl, entry + i);
 
                *pua = tce;
 
@@ -572,7 +658,7 @@ static long tce_iommu_create_table(struct tce_container *container,
        if (!table_size)
                return -EINVAL;
 
-       ret = try_increment_locked_vm(table_size >> PAGE_SHIFT);
+       ret = try_increment_locked_vm(container->mm, table_size >> PAGE_SHIFT);
        if (ret)
                return ret;
 
@@ -582,25 +668,17 @@ static long tce_iommu_create_table(struct tce_container *container,
        WARN_ON(!ret && !(*ptbl)->it_ops->free);
        WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));
 
-       if (!ret && container->v2) {
-               ret = tce_iommu_userspace_view_alloc(*ptbl);
-               if (ret)
-                       (*ptbl)->it_ops->free(*ptbl);
-       }
-
-       if (ret)
-               decrement_locked_vm(table_size >> PAGE_SHIFT);
-
        return ret;
 }
 
-static void tce_iommu_free_table(struct iommu_table *tbl)
+static void tce_iommu_free_table(struct tce_container *container,
+               struct iommu_table *tbl)
 {
        unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
 
-       tce_iommu_userspace_view_free(tbl);
+       tce_iommu_userspace_view_free(tbl, container->mm);
        tbl->it_ops->free(tbl);
-       decrement_locked_vm(pages);
+       decrement_locked_vm(container->mm, pages);
 }
 
 static long tce_iommu_create_window(struct tce_container *container,
@@ -663,7 +741,7 @@ unset_exit:
                table_group = iommu_group_get_iommudata(tcegrp->grp);
                table_group->ops->unset_window(table_group, num);
        }
-       tce_iommu_free_table(tbl);
+       tce_iommu_free_table(container, tbl);
 
        return ret;
 }
@@ -701,12 +779,41 @@ static long tce_iommu_remove_window(struct tce_container *container,
 
        /* Free table */
        tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
-       tce_iommu_free_table(tbl);
+       tce_iommu_free_table(container, tbl);
        container->tables[num] = NULL;
 
        return 0;
 }
 
+static long tce_iommu_create_default_window(struct tce_container *container)
+{
+       long ret;
+       __u64 start_addr = 0;
+       struct tce_iommu_group *tcegrp;
+       struct iommu_table_group *table_group;
+
+       if (!container->def_window_pending)
+               return 0;
+
+       if (!tce_groups_attached(container))
+               return -ENODEV;
+
+       tcegrp = list_first_entry(&container->group_list,
+                       struct tce_iommu_group, next);
+       table_group = iommu_group_get_iommudata(tcegrp->grp);
+       if (!table_group)
+               return -ENODEV;
+
+       ret = tce_iommu_create_window(container, IOMMU_PAGE_SHIFT_4K,
+                       table_group->tce32_size, 1, &start_addr);
+       WARN_ON_ONCE(!ret && start_addr);
+
+       if (!ret)
+               container->def_window_pending = false;
+
+       return ret;
+}
+
 static long tce_iommu_ioctl(void *iommu_data,
                                 unsigned int cmd, unsigned long arg)
 {
@@ -727,7 +834,17 @@ static long tce_iommu_ioctl(void *iommu_data,
                }
 
                return (ret < 0) ? 0 : ret;
+       }
+
+       /*
+        * Sanity check to prevent one userspace process from
+        * manipulating another process's mm.
+        */
+       BUG_ON(!container);
+       if (container->mm && container->mm != current->mm)
+               return -EPERM;
 
+       switch (cmd) {
        case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
                struct vfio_iommu_spapr_tce_info info;
                struct tce_iommu_group *tcegrp;
@@ -797,6 +914,10 @@ static long tce_iommu_ioctl(void *iommu_data,
                                VFIO_DMA_MAP_FLAG_WRITE))
                        return -EINVAL;
 
+               ret = tce_iommu_create_default_window(container);
+               if (ret)
+                       return ret;
+
                num = tce_iommu_find_table(container, param.iova, &tbl);
                if (num < 0)
                        return -ENXIO;
@@ -860,6 +981,10 @@ static long tce_iommu_ioctl(void *iommu_data,
                if (param.flags)
                        return -EINVAL;
 
+               ret = tce_iommu_create_default_window(container);
+               if (ret)
+                       return ret;
+
                num = tce_iommu_find_table(container, param.iova, &tbl);
                if (num < 0)
                        return -ENXIO;
@@ -888,6 +1013,10 @@ static long tce_iommu_ioctl(void *iommu_data,
                minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
                                size);
 
+               ret = tce_iommu_mm_set(container);
+               if (ret)
+                       return ret;
+
                if (copy_from_user(&param, (void __user *)arg, minsz))
                        return -EFAULT;
 
@@ -911,6 +1040,9 @@ static long tce_iommu_ioctl(void *iommu_data,
                if (!container->v2)
                        break;
 
+               if (!container->mm)
+                       return -EPERM;
+
                minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
                                size);
 
@@ -969,6 +1101,10 @@ static long tce_iommu_ioctl(void *iommu_data,
                if (!container->v2)
                        break;
 
+               ret = tce_iommu_mm_set(container);
+               if (ret)
+                       return ret;
+
                if (!tce_groups_attached(container))
                        return -ENXIO;
 
@@ -986,6 +1122,10 @@ static long tce_iommu_ioctl(void *iommu_data,
 
                mutex_lock(&container->lock);
 
+               ret = tce_iommu_create_default_window(container);
+               if (ret) {
+                       mutex_unlock(&container->lock);
+                       return ret;
+               }
+
                ret = tce_iommu_create_window(container, create.page_shift,
                                create.window_size, create.levels,
                                &create.start_addr);
@@ -1003,6 +1143,10 @@ static long tce_iommu_ioctl(void *iommu_data,
                if (!container->v2)
                        break;
 
+               ret = tce_iommu_mm_set(container);
+               if (ret)
+                       return ret;
+
                if (!tce_groups_attached(container))
                        return -ENXIO;
 
@@ -1018,6 +1162,11 @@ static long tce_iommu_ioctl(void *iommu_data,
                if (remove.flags)
                        return -EINVAL;
 
+               if (container->def_window_pending && !remove.start_addr) {
+                       container->def_window_pending = false;
+                       return 0;
+               }
+
                mutex_lock(&container->lock);
 
                ret = tce_iommu_remove_window(container, remove.start_addr);
@@ -1043,7 +1192,7 @@ static void tce_iommu_release_ownership(struct tce_container *container,
                        continue;
 
                tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
-               tce_iommu_userspace_view_free(tbl);
+               tce_iommu_userspace_view_free(tbl, container->mm);
                if (tbl->it_map)
                        iommu_release_ownership(tbl);
 
@@ -1062,10 +1211,7 @@ static int tce_iommu_take_ownership(struct tce_container *container,
                if (!tbl || !tbl->it_map)
                        continue;
 
-               rc = tce_iommu_userspace_view_alloc(tbl);
-               if (!rc)
-                       rc = iommu_take_ownership(tbl);
-
+               rc = iommu_take_ownership(tbl);
                if (rc) {
                        for (j = 0; j < i; ++j)
                                iommu_release_ownership(
@@ -1100,9 +1246,6 @@ static void tce_iommu_release_ownership_ddw(struct tce_container *container,
 static long tce_iommu_take_ownership_ddw(struct tce_container *container,
                struct iommu_table_group *table_group)
 {
-       long i, ret = 0;
-       struct iommu_table *tbl = NULL;
-
        if (!table_group->ops->create_table || !table_group->ops->set_window ||
                        !table_group->ops->release_ownership) {
                WARN_ON_ONCE(1);
@@ -1111,47 +1254,7 @@ static long tce_iommu_take_ownership_ddw(struct tce_container *container,
 
        table_group->ops->take_ownership(table_group);
 
-       /*
-        * If it the first group attached, check if there is
-        * a default DMA window and create one if none as
-        * the userspace expects it to exist.
-        */
-       if (!tce_groups_attached(container) && !container->tables[0]) {
-               ret = tce_iommu_create_table(container,
-                               table_group,
-                               0, /* window number */
-                               IOMMU_PAGE_SHIFT_4K,
-                               table_group->tce32_size,
-                               1, /* default levels */
-                               &tbl);
-               if (ret)
-                       goto release_exit;
-               else
-                       container->tables[0] = tbl;
-       }
-
-       /* Set all windows to the new group */
-       for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-               tbl = container->tables[i];
-
-               if (!tbl)
-                       continue;
-
-               /* Set the default window to a new group */
-               ret = table_group->ops->set_window(table_group, i, tbl);
-               if (ret)
-                       goto release_exit;
-       }
-
        return 0;
-
-release_exit:
-       for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
-               table_group->ops->unset_window(table_group, i);
-
-       table_group->ops->release_ownership(table_group);
-
-       return ret;
 }
 
 static int tce_iommu_attach_group(void *iommu_data,
@@ -1203,10 +1306,13 @@ static int tce_iommu_attach_group(void *iommu_data,
        }
 
        if (!table_group->ops || !table_group->ops->take_ownership ||
-                       !table_group->ops->release_ownership)
+                       !table_group->ops->release_ownership) {
                ret = tce_iommu_take_ownership(container, table_group);
-       else
+       } else {
                ret = tce_iommu_take_ownership_ddw(container, table_group);
+               if (!tce_groups_attached(container) && !container->tables[0])
+                       container->def_window_pending = true;
+       }
 
        if (!ret) {
                tcegrp->grp = iommu_group;
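
The vfio_iommu_spapr_tce changes above replace every use of current->mm with a struct mm_struct * cached in the container: tce_iommu_mm_set() pins the mm of the first process that actually uses the container (atomic_inc(&mm->mm_count)) and tce_iommu_release() drops it with mmdrop(), so locked_vm accounting and preregistered-memory cleanup still work after the owning process has exited. A stripped-down sketch of that ownership pattern, with my_container as a hypothetical stand-in:

#include <linux/sched.h>        /* current, mmdrop() */
#include <linux/mm_types.h>     /* struct mm_struct */

struct my_container {
        struct mm_struct *mm;   /* owner mm, pinned while the container lives */
};

/* The first caller that does real work becomes the owner; others are rejected. */
static int my_container_set_mm(struct my_container *c)
{
        if (c->mm)
                return c->mm == current->mm ? 0 : -EPERM;

        c->mm = current->mm;
        atomic_inc(&c->mm->mm_count);   /* keeps the mm_struct itself alive */
        return 0;
}

static void my_container_release(struct my_container *c)
{
        if (c->mm)
                mmdrop(c->mm);          /* pairs with the atomic_inc above */
}
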
index 406c33dcae137a62af7e7d78edf11938270a45c9..a33f63351f86f0ce530570a79ce0c4841e03e388 100644 (file)
@@ -148,7 +148,36 @@ struct kexec_file_ops {
        kexec_verify_sig_t *verify_sig;
 #endif
 };
-#endif
+
+/**
+ * struct kexec_buf - parameters for finding a place for a buffer in memory
+ * @image:     kexec image in which to search for the memory.
+ * @buffer:    Contents which will be copied to the allocated memory.
+ * @bufsz:     Size of @buffer.
+ * @mem:       On return will have address of the buffer in memory.
+ * @memsz:     Size for the buffer in memory.
+ * @buf_align: Minimum alignment needed.
+ * @buf_min:   The buffer can't be placed below this address.
+ * @buf_max:   The buffer can't be placed above this address.
+ * @top_down:  Allocate from top of memory.
+ */
+struct kexec_buf {
+       struct kimage *image;
+       void *buffer;
+       unsigned long bufsz;
+       unsigned long mem;
+       unsigned long memsz;
+       unsigned long buf_align;
+       unsigned long buf_min;
+       unsigned long buf_max;
+       bool top_down;
+};
+
+int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
+                              int (*func)(u64, u64, void *));
+extern int kexec_add_buffer(struct kexec_buf *kbuf);
+int kexec_locate_mem_hole(struct kexec_buf *kbuf);
+#endif /* CONFIG_KEXEC_FILE */
 
 struct kimage {
        kimage_entry_t head;
@@ -212,11 +241,6 @@ extern asmlinkage long sys_kexec_load(unsigned long entry,
                                        struct kexec_segment __user *segments,
                                        unsigned long flags);
 extern int kernel_kexec(void);
-extern int kexec_add_buffer(struct kimage *image, char *buffer,
-                           unsigned long bufsz, unsigned long memsz,
-                           unsigned long buf_align, unsigned long buf_min,
-                           unsigned long buf_max, bool top_down,
-                           unsigned long *load_addr);
 extern struct page *kimage_alloc_control_pages(struct kimage *image,
                                                unsigned int order);
 extern int kexec_load_purgatory(struct kimage *image, unsigned long min,
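
With the old nine-argument kexec_add_buffer() gone, callers now describe the placement request in a struct kexec_buf and read the chosen address back from kbuf.mem. A hedged sketch of how a kexec_file loader might use it; image, payload, payload_size and load_addr are made-up caller variables, while the fields and the call itself are the ones declared above:

        struct kexec_buf kbuf = {
                .image     = image,             /* kimage being built by the loader */
                .buffer    = payload,           /* data to copy into the segment */
                .bufsz     = payload_size,
                .memsz     = payload_size,      /* rounded up to PAGE_SIZE internally */
                .buf_align = PAGE_SIZE,
                .buf_min   = 0,
                .buf_max   = ULONG_MAX,         /* anywhere in RAM */
                .top_down  = false,
        };
        int ret = kexec_add_buffer(&kbuf);
        if (ret)
                return ret;
        load_addr = kbuf.mem;                   /* where the segment will be placed */
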
index 037c321c56188f9fbb35193072f08052d6e4771f..0c2df7f737925b6d0bd5bf624ceb12e8a6d53d86 100644 (file)
@@ -428,25 +428,65 @@ static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
        return locate_mem_hole_bottom_up(start, end, kbuf);
 }
 
-/*
- * Helper function for placing a buffer in a kexec segment. This assumes
- * that kexec_mutex is held.
+/**
+ * arch_kexec_walk_mem - call func(data) on free memory regions
+ * @kbuf:      Context info for the search. Also passed to @func.
+ * @func:      Function to call for each memory region.
+ *
+ * Return: The memory walk will stop when func returns a non-zero value
+ * and that value will be returned. If all free regions are visited without
+ * func returning non-zero, then zero will be returned.
+ */
+int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
+                              int (*func)(u64, u64, void *))
+{
+       if (kbuf->image->type == KEXEC_TYPE_CRASH)
+               return walk_iomem_res_desc(crashk_res.desc,
+                                          IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
+                                          crashk_res.start, crashk_res.end,
+                                          kbuf, func);
+       else
+               return walk_system_ram_res(0, ULONG_MAX, kbuf, func);
+}
+
+/**
+ * kexec_locate_mem_hole - find free memory for the purgatory or the next kernel
+ * @kbuf:      Parameters for the memory search.
+ *
+ * On success, kbuf->mem will have the start address of the memory region found.
+ *
+ * Return: 0 on success, negative errno on error.
  */
-int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
-                    unsigned long memsz, unsigned long buf_align,
-                    unsigned long buf_min, unsigned long buf_max,
-                    bool top_down, unsigned long *load_addr)
+int kexec_locate_mem_hole(struct kexec_buf *kbuf)
+{
+       int ret;
+
+       ret = arch_kexec_walk_mem(kbuf, locate_mem_hole_callback);
+
+       return ret == 1 ? 0 : -EADDRNOTAVAIL;
+}
+
+/**
+ * kexec_add_buffer - place a buffer in a kexec segment
+ * @kbuf:      Buffer contents and memory parameters.
+ *
+ * This function assumes that kexec_mutex is held.
+ * On successful return, @kbuf->mem will have the physical address of
+ * the buffer in memory.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int kexec_add_buffer(struct kexec_buf *kbuf)
 {
 
        struct kexec_segment *ksegment;
-       struct kexec_buf buf, *kbuf;
        int ret;
 
        /* Currently adding segment this way is allowed only in file mode */
-       if (!image->file_mode)
+       if (!kbuf->image->file_mode)
                return -EINVAL;
 
-       if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+       if (kbuf->image->nr_segments >= KEXEC_SEGMENT_MAX)
                return -EINVAL;
 
        /*
@@ -456,45 +496,27 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
         * logic goes through list of segments to make sure there are
         * no destination overlaps.
         */
-       if (!list_empty(&image->control_pages)) {
+       if (!list_empty(&kbuf->image->control_pages)) {
                WARN_ON(1);
                return -EINVAL;
        }
 
-       memset(&buf, 0, sizeof(struct kexec_buf));
-       kbuf = &buf;
-       kbuf->image = image;
-       kbuf->buffer = buffer;
-       kbuf->bufsz = bufsz;
-
-       kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
-       kbuf->buf_align = max(buf_align, PAGE_SIZE);
-       kbuf->buf_min = buf_min;
-       kbuf->buf_max = buf_max;
-       kbuf->top_down = top_down;
+       /* Ensure minimum alignment needed for segments. */
+       kbuf->memsz = ALIGN(kbuf->memsz, PAGE_SIZE);
+       kbuf->buf_align = max(kbuf->buf_align, PAGE_SIZE);
 
        /* Walk the RAM ranges and allocate a suitable range for the buffer */
-       if (image->type == KEXEC_TYPE_CRASH)
-               ret = walk_iomem_res_desc(crashk_res.desc,
-                               IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
-                               crashk_res.start, crashk_res.end, kbuf,
-                               locate_mem_hole_callback);
-       else
-               ret = walk_system_ram_res(0, -1, kbuf,
-                                         locate_mem_hole_callback);
-       if (ret != 1) {
-               /* A suitable memory range could not be found for buffer */
-               return -EADDRNOTAVAIL;
-       }
+       ret = kexec_locate_mem_hole(kbuf);
+       if (ret)
+               return ret;
 
        /* Found a suitable memory range */
-       ksegment = &image->segment[image->nr_segments];
+       ksegment = &kbuf->image->segment[kbuf->image->nr_segments];
        ksegment->kbuf = kbuf->buffer;
        ksegment->bufsz = kbuf->bufsz;
        ksegment->mem = kbuf->mem;
        ksegment->memsz = kbuf->memsz;
-       image->nr_segments++;
-       *load_addr = ksegment->mem;
+       kbuf->image->nr_segments++;
        return 0;
 }
 
@@ -616,13 +638,15 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
                                  unsigned long max, int top_down)
 {
        struct purgatory_info *pi = &image->purgatory_info;
-       unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
-       unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+       unsigned long align, bss_align, bss_sz, bss_pad;
+       unsigned long entry, load_addr, curr_load_addr, bss_addr, offset;
        unsigned char *buf_addr, *src;
        int i, ret = 0, entry_sidx = -1;
        const Elf_Shdr *sechdrs_c;
        Elf_Shdr *sechdrs = NULL;
-       void *purgatory_buf = NULL;
+       struct kexec_buf kbuf = { .image = image, .bufsz = 0, .buf_align = 1,
+                                 .buf_min = min, .buf_max = max,
+                                 .top_down = top_down };
 
        /*
         * sechdrs_c points to section headers in purgatory and are read
@@ -688,9 +712,7 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
        }
 
        /* Determine how much memory is needed to load relocatable object. */
-       buf_align = 1;
        bss_align = 1;
-       buf_sz = 0;
        bss_sz = 0;
 
        for (i = 0; i < pi->ehdr->e_shnum; i++) {
@@ -699,10 +721,10 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
 
                align = sechdrs[i].sh_addralign;
                if (sechdrs[i].sh_type != SHT_NOBITS) {
-                       if (buf_align < align)
-                               buf_align = align;
-                       buf_sz = ALIGN(buf_sz, align);
-                       buf_sz += sechdrs[i].sh_size;
+                       if (kbuf.buf_align < align)
+                               kbuf.buf_align = align;
+                       kbuf.bufsz = ALIGN(kbuf.bufsz, align);
+                       kbuf.bufsz += sechdrs[i].sh_size;
                } else {
                        /* bss section */
                        if (bss_align < align)
@@ -714,32 +736,31 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
 
        /* Determine the bss padding required to align bss properly */
        bss_pad = 0;
-       if (buf_sz & (bss_align - 1))
-               bss_pad = bss_align - (buf_sz & (bss_align - 1));
+       if (kbuf.bufsz & (bss_align - 1))
+               bss_pad = bss_align - (kbuf.bufsz & (bss_align - 1));
 
-       memsz = buf_sz + bss_pad + bss_sz;
+       kbuf.memsz = kbuf.bufsz + bss_pad + bss_sz;
 
        /* Allocate buffer for purgatory */
-       purgatory_buf = vzalloc(buf_sz);
-       if (!purgatory_buf) {
+       kbuf.buffer = vzalloc(kbuf.bufsz);
+       if (!kbuf.buffer) {
                ret = -ENOMEM;
                goto out;
        }
 
-       if (buf_align < bss_align)
-               buf_align = bss_align;
+       if (kbuf.buf_align < bss_align)
+               kbuf.buf_align = bss_align;
 
        /* Add buffer to segment list */
-       ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
-                               buf_align, min, max, top_down,
-                               &pi->purgatory_load_addr);
+       ret = kexec_add_buffer(&kbuf);
        if (ret)
                goto out;
+       pi->purgatory_load_addr = kbuf.mem;
 
        /* Load SHF_ALLOC sections */
-       buf_addr = purgatory_buf;
+       buf_addr = kbuf.buffer;
        load_addr = curr_load_addr = pi->purgatory_load_addr;
-       bss_addr = load_addr + buf_sz + bss_pad;
+       bss_addr = load_addr + kbuf.bufsz + bss_pad;
 
        for (i = 0; i < pi->ehdr->e_shnum; i++) {
                if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -785,11 +806,11 @@ static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
         * Used later to identify which section is purgatory and skip it
         * from checksumming.
         */
-       pi->purgatory_buf = purgatory_buf;
+       pi->purgatory_buf = kbuf.buffer;
        return ret;
 out:
        vfree(sechdrs);
-       vfree(purgatory_buf);
+       vfree(kbuf.buffer);
        return ret;
 }
 
index 0a52315d9c626abf5e17db2b348291e09efb6e15..4cef7e4706b098d7918b53ff1e1b931d1a5ec8dc 100644 (file)
@@ -20,22 +20,6 @@ struct kexec_sha_region {
        unsigned long len;
 };
 
-/*
- * Keeps track of buffer parameters as provided by caller for requesting
- * memory placement of buffer.
- */
-struct kexec_buf {
-       struct kimage *image;
-       char *buffer;
-       unsigned long bufsz;
-       unsigned long mem;
-       unsigned long memsz;
-       unsigned long buf_align;
-       unsigned long buf_min;
-       unsigned long buf_max;
-       bool top_down;          /* allocate from top of memory hole */
-};
-
 void kimage_file_post_load_cleanup(struct kimage *image);
 #else /* CONFIG_KEXEC_FILE */
 static inline void kimage_file_post_load_cleanup(struct kimage *image) { }